From 36660d4c356d4c6b71eb8df51e094ea36bfa2c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Wed, 2 Mar 2022 14:02:42 +0800 Subject: [PATCH 01/41] [infrt] speed up the infrt ci. test=devvelop (#40032) --- paddle/scripts/infrt_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 8d858647ea63d..a0132501387e0 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -102,9 +102,11 @@ function infrt_gen_and_build() { function create_fake_models() { cd ${PADDLE_ROOT}/build + cd python/dist/ # create multi_fc model, this will generate "multi_fc_model" python3 -m pip uninstall -y paddlepaddle - python3 -m pip install paddlepaddle + python3 -m pip install *whl + cd ${PADDLE_ROOT}/build python3 ${PADDLE_ROOT}/tools/infrt/fake_models/multi_fc.py } From 9070d5c5d85e15a04324b6a5f2f1e2c9a7ecc1b6 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Wed, 2 Mar 2022 14:08:19 +0800 Subject: [PATCH 02/41] test=document_fix;record py3 case time (#40018) --- paddle/scripts/paddle_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9bef7e1285128..ed70a8638bf73 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -776,7 +776,9 @@ set +x tmpfile=$tmp_dir/$tmpfile_rand ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile fi - + ut_total_endTime_s=`date +%s` + echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s" + collect_failed_tests rm -f $tmp_dir/* exec_times=0 From b4d931e8bce97a12e9ac7a12ff6c0a11499002c7 Mon Sep 17 00:00:00 2001 From: qipengh Date: Wed, 2 Mar 2022 14:23:35 +0800 Subject: [PATCH 03/41] [MLU] adapt matmul op (#39727) * [MLU] adapt matmul op * [MLU] fix phi namespace --- paddle/fluid/imperative/CMakeLists.txt | 6 +- paddle/fluid/operators/matmul_op_mlu.cc | 337 ++++++++++++++++++ .../tests/unittests/mlu/test_matmul_op_mlu.py | 329 +++++++++++++++++ 3 files changed, 671 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/matmul_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index f198919b0c87b..e1ce705533ab4 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -46,8 +46,12 @@ if(WITH_GLOO) endif() endif() +if(WITH_MLU) + SET(MLU_DEPS mlu_baseop) +endif() + if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) else() cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc new file mode 100644 index 0000000000000..d0c84c4751e78 --- /dev/null +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -0,0 +1,337 @@ +/* Copyright 
(c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +static void Mul(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out), ToCnnlDataType(), alpha); +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." + "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Matmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + if (!Out->initialized()) { + Out->mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." 
+ "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::BatchMatmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const std::vector& dims, + const std::vector& bcast_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = bcast_dims.size(); + int64_t diff = bcast_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (bcast_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + std::vector reduce_dims(axes.begin(), axes.end()); + MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD, + ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr, + in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/, nullptr, + nullptr, out_desc.get(), GetBasePtr(out)); +} + +template +class MatMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + + // Case 1: [K] x [K] = [1] + // Equal: [1, K] x [K, 1] = [1, 1] => [1] + const bool all_one_dim = (x_ndim == 1 && y_ndim == 1); + if (all_one_dim) { + Out->Resize({1, 1}); + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + x_temp.Resize(phi::make_ddim(x_dims)); + x_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < y_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.insert(temp_out_dims.end() - 1, 1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + if (y_ndim == 1) { + y_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + y_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < x_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.push_back(1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." 
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + if (x_ndim == 2 && y_ndim == 2) { + // Case 2: [M, K] x [K, N] = [M, N] + MatMul2D(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } else { + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + MatMulND(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } + + if (phi::vectorize(Out->dims()) != out_dims) { + Out->Resize(phi::make_ddim(out_dims)); + } + } +}; + +template +class MatMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); + + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + if (dX) { + Mul(ctx, *dOut, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, *dOut, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(phi::make_ddim(x_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(phi::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha); + } else { + MatMul2D(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha); + } + dX->Resize(X->dims()); + } + if (dY) { + dY->Resize(phi::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha); + } else { + MatMul2D(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha); + } + dY->Resize(Y->dims()); + } + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_bcast_dims(out_ndim, 1); + std::vector y_bcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); + + if (dX) { + Tensor dx_temp(X->type()); + if (x_dims != x_bcast_dims) { + dx_temp.Resize(phi::make_ddim(x_bcast_dims)); + } else { + dX->mutable_data(ctx.GetPlace()); + dx_temp.ShareDataWith(*dX); + } + + if (transpose_x) { + MatMulND(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha); + } else { + MatMulND(ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, + alpha); + } + + if (x_dims != x_bcast_dims) { + ReduceDims(ctx, x_dims, 
x_bcast_dims, dx_temp, dX); + } + } + + if (dY) { + Tensor dy_temp(Y->type()); + if (y_dims != y_bcast_dims) { + dy_temp.Resize(phi::make_ddim(y_bcast_dims)); + } else { + dY->mutable_data(ctx.GetPlace()); + dy_temp.ShareDataWith(*dY); + } + + if (transpose_y) { + MatMulND(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha); + } else { + MatMulND(ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, + alpha); + } + + if (y_dims != y_bcast_dims) { + ReduceDims(ctx, y_dims, y_bcast_dims, dy_temp, dY); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(matmul, ops::MatMulMLUKernel, + ops::MatMulMLUKernel); +REGISTER_OP_MLU_KERNEL(matmul_grad, ops::MatMulGradMLUKernel, + ops::MatMulGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py new file mode 100644 index 0000000000000..adfff112e6be2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py @@ -0,0 +1,329 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
+ Out = np.array([Out], dtype="float64") + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_mlu() + self.op_type = "matmul" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, + self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y, + 'alpha': self.alpha + } + self.outputs = {'Out': Out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 2, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 
5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.transpose_X = False + self.transpose_Y = False + + +# TODO(mlu): alpha will be supported in next version +#--------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +#--------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() From 0764fda25bb016bf143fc0a3aa93a3fb56b0cd73 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 2 Mar 2022 15:07:34 +0800 Subject: [PATCH 04/41] [Phi] Unify complex type trait and fix real imag bug (#40036) * unify complex type trait and fix real imag bug * add unittest for type tratis --- paddle/fluid/operators/angle_op.h | 6 +- paddle/fluid/operators/eig_op.h | 26 ++-- paddle/fluid/operators/eigh_op.h | 2 +- paddle/fluid/operators/eigvals_op.h | 14 +- paddle/fluid/operators/imag_op.cc | 2 +- paddle/fluid/operators/lstsq_op.h | 4 +- .../operators/math/eigen_values_vectors.h | 8 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/qr_op.cu | 14 +- paddle/fluid/operators/qr_op.h | 18 +-- paddle/fluid/operators/real_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 12 +- paddle/fluid/operators/svd_op.h | 12 +- paddle/phi/common/type_traits.h | 96 ++++++++++++++ paddle/phi/infermeta/unary.cc | 7 + paddle/phi/infermeta/unary.h | 2 + 
paddle/phi/kernels/cpu/abs_kernel.cc | 6 +- paddle/phi/kernels/cpu/complex_kernel.cc | 8 +- paddle/phi/kernels/funcs/complex_functors.h | 123 ++++++------------ paddle/phi/kernels/gpu/abs_kernel.cu | 10 +- paddle/phi/kernels/gpu/complex_kernel.cu | 8 +- .../phi/kernels/impl/abs_grad_kernel_impl.h | 2 +- .../kernels/impl/complex_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 8 +- paddle/phi/tests/common/test_data_type.cc | 16 +++ 25 files changed, 247 insertions(+), 165 deletions(-) create mode 100644 paddle/phi/common/type_traits.h diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index db5a3ea296194..116a8053db3ed 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -36,8 +36,8 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(phi::funcs::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(phi::dtype::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); @@ -57,7 +57,7 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 03b25c6705ac5..e9c6c1eb7eced 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -87,19 +87,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - phi::funcs::Real* rwork_data = nullptr; + phi::dtype::Real* rwork_data = nullptr; rwork.Resize(phi::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); lwork = std::max( - 1, static_cast(phi::funcs::Real(computed_work_size))); + 1, static_cast(phi::dtype::Real(computed_work_size))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -109,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,23 +207,23 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = phi::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, + real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), + real_vectors.mutable_data>(x->dims(), context.GetPlace()); - ApplyEigKernel>( + ApplyEigKernel>( *x, &real_values, &real_vectors, context); auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, 
phi::funcs::Real, Tout>(context); + DeviceContext, phi::dtype::Real, Tout>(context); // 1. extract real part & imag part from real_values Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); @@ -236,7 +236,7 @@ class EigKernel : public framework::OpKernel { Tensor real_vector_trans = dito.Transpose(real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -272,7 +272,7 @@ void ComputeBackwardForComplexInput( // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un = diag_unsqueezed.data>(); auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), static_cast(numel * sizeof(Tout))); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h index 294794877b32e..5279ec750935c 100644 --- a/paddle/fluid/operators/eigh_op.h +++ b/paddle/fluid/operators/eigh_op.h @@ -40,7 +40,7 @@ template class EighGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto& x_grad = *ctx.Output(framework::GradVarName("X")); x_grad.mutable_data(ctx.GetPlace()); auto& output_w = *ctx.Input("Eigenvalues"); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 59eabfb29b97e..4627acc0d07de 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename phi::funcs::Real; +using Real = typename phi::dtype::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -144,7 +144,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::funcs::Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -154,11 +154,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -188,10 +188,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; int info; - 
phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, - static_cast*>(NULL), &info); + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -208,7 +208,7 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(phi::make_ddim({n_dim << 1}), + rwork.mutable_data>(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 33b68d68992dd..567a69f383d1c 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index a4c3d1c81fb3e..3cbbc62e7bec9 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rank_32, &wkopt, lwork, &rwkopt, &info); } - lwork = std::max(1, static_cast(phi::funcs::Real(wkopt))); + lwork = std::max(1, static_cast(phi::dtype::Real(wkopt))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9b6ebf73d9b09..1ade2190bb96e 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,7 +123,7 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - phi::funcs::lapackEigh>( + phi::funcs::lapackEigh>( jobz, uplo, n, input_data, lda, value_data, work_data, lwork, rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, const T *A, int lda, const 
ValueType *W, int *lwork) const; diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 38692a646111e..9994ccc10cb13 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = phi::funcs::Real; + using RealT = phi::dtype::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 5e841a097fed7..a57a8d5cf8b7f 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,13 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - r.mutable_data>( + r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr Tensor qr; - qr.mutable_data>( + qr.mutable_data>( context.GetPlace(), - size_t(batch_size * m * n * sizeof(phi::funcs::Real))); + size_t(batch_size * m * n * sizeof(phi::dtype::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -126,7 +126,7 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::funcs::Real), + qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } BatchedOrgqr( diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index cef9371fea099..f09a07e96cd34 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -74,19 +74,19 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - auto* r_data = r.mutable_data>( + auto* r_data = r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); + memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = 
ctx.template device_context(); phi::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -224,7 +224,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1f3691978b577..28a8484f539fc 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index bcb3ee44f0465..166f49999d552 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -105,7 +105,7 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, + return platform::complex>(x.real * y.real, x.imag * y.real); } }; @@ -391,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -661,9 +661,9 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f5e451ac7054d..42a847206a3cb 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? 
cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(phi::funcs::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(phi::dtype::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(phi::funcs::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(phi::funcs::Real))); + size_t(batches * col_v * cols * sizeof(phi::dtype::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(phi::dtype::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/phi/common/type_traits.h b/paddle/phi/common/type_traits.h new file mode 100644 index 0000000000000..ef894eee46835 --- /dev/null +++ b/paddle/phi/common/type_traits.h @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" + +namespace phi { +namespace dtype { + +template +struct cond { + static constexpr bool value = B; + using type = T; +}; + +template +struct eval_if { + using type = typename TrueF::type; +}; + +template +struct eval_if { + using type = typename FalseF::type; +}; + +template +using eval_if_t = typename eval_if::type; + +template +struct select { + using type = eval_if_t>; +}; + +template +struct select { + using type = T; +}; + +template +struct select> { + // last one had better be true! + static_assert(B, "No match select type!"); + using type = T; +}; + +template +using select_t = typename select::type; + +// runtime real and complex type conversion + +template +using Real = select_t>::value, float>, + cond>::value, double>, + T>; + +template +using Complex = select_t::value, complex>, + cond::value, complex>, + T>; + +inline DataType ToReal(DataType dtype) { + switch (dtype) { + case phi::DataType::COMPLEX64: + return phi::DataType::FLOAT32; + case phi::DataType::COMPLEX128: + return phi::DataType::FLOAT64; + default: + return dtype; + } +} + +inline DataType ToComplex(DataType dtype) { + switch (dtype) { + case phi::DataType::FLOAT32: + return phi::DataType::COMPLEX64; + case phi::DataType::FLOAT64: + return phi::DataType::COMPLEX128; + default: + return dtype; + } +} + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 983e016226492..fbd9259a83f86 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -51,6 +52,12 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, out->share_meta(x); } +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(dtype::ToReal(x.dtype())); + out->set_layout(x.layout()); +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a2d779e0f7093..3c0628981af7c 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -39,6 +39,8 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, int axis, MetaTensor* out); +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index efe7d090405df..9f89fc27a7167 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -25,9 +25,9 @@ template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - ctx.template Alloc>( - out, size_t(x.numel() * sizeof(phi::funcs::Real))); - auto* out_data = out->data>(); + ctx.template Alloc>( + out, size_t(x.numel() * sizeof(phi::dtype::Real))); + auto* out_data = out->data>(); phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 801502e16737d..859d5a84527a2 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -37,11 +37,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, CPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 86dbdd099ecde..8b292cb5dc52e 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -20,56 +20,12 @@ limitations under the License. */ #include #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/hostdevice.h" namespace phi { namespace funcs { -template -struct cond { - static constexpr bool value = B; - using type = T; -}; - -template -struct eval_if { - using type = typename TrueF::type; -}; - -template -struct eval_if { - using type = typename FalseF::type; -}; - -template -using eval_if_t = typename eval_if::type; - -template -struct select { - using type = eval_if_t>; -}; - -template -struct select { - using type = T; -}; - -template -struct select> { - // last one had better be true! 
- static_assert(B, "No match select type!"); - using type = T; -}; - -template -using select_t = typename select::type; - -template -using Real = - select_t>::value, float>, - cond>::value, double>, - T>; - template using Complex = typename std::enable_if::value>::type; @@ -91,9 +47,9 @@ template struct RealFunctor; template -struct RealFunctor>> { +struct RealFunctor>> { public: - RealFunctor(const T* input, Real* output, int64_t numel) + RealFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -102,7 +58,7 @@ struct RealFunctor>> { private: const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -110,8 +66,8 @@ template struct ImagFunctor; template -struct ImagFunctor>> { - ImagFunctor(const T* input, Real* output, int64_t numel) +struct ImagFunctor>> { + ImagFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -119,7 +75,7 @@ struct ImagFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -127,8 +83,8 @@ template struct AbsFunctor; template -struct AbsFunctor>> { - AbsFunctor(const T* input, Real* output, int64_t numel) +struct AbsFunctor>> { + AbsFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -136,12 +92,12 @@ struct AbsFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; template -struct AbsFunctor>> { +struct AbsFunctor>> { AbsFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -203,7 +159,10 @@ struct AbsGradCUDAFunctor> { template struct AbsGradFunctor { - AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) + AbsGradFunctor(const dtype::Real* dout, + const T* x, + T* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -214,7 +173,7 @@ struct AbsGradFunctor { } } - const Real* dout_; + const dtype::Real* dout_; const T* x_; T* output_; int64_t numel_; @@ -334,8 +293,8 @@ template struct RealToComplexFunctor; template -struct RealToComplexFunctor>> { - RealToComplexFunctor(const Real* input, T* output, int64_t numel) +struct RealToComplexFunctor>> { + RealToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -343,7 +302,7 @@ struct RealToComplexFunctor>> { output_[idx].imag = 0; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -352,8 +311,8 @@ template struct ImagToComplexFunctor; template -struct ImagToComplexFunctor>> { - ImagToComplexFunctor(const Real* input, T* output, int64_t numel) +struct ImagToComplexFunctor>> { + ImagToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -361,7 +320,7 @@ struct ImagToComplexFunctor>> { output_[idx].imag = input_[idx]; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -370,9 +329,9 @@ template struct RealImagToComplexFunctor; template -struct RealImagToComplexFunctor>> { - RealImagToComplexFunctor(const Real* input_real, - const Real* input_imag, +struct 
RealImagToComplexFunctor>> { + RealImagToComplexFunctor(const dtype::Real* input_real, + const dtype::Real* input_imag, T* output, int64_t numel) : input_real_(input_real), @@ -385,8 +344,8 @@ struct RealImagToComplexFunctor>> { output_[idx].imag = input_imag_[idx]; } - const Real* input_real_; - const Real* input_imag_; + const dtype::Real* input_real_; + const dtype::Real* input_imag_; T* output_; int64_t numel_; }; @@ -423,8 +382,8 @@ struct AngleFunctor; // angel function for complex template -struct AngleFunctor>> { - AngleFunctor(const T* input, phi::funcs::Real* output, int64_t numel) +struct AngleFunctor>> { + AngleFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -432,13 +391,13 @@ struct AngleFunctor>> { } const T* input_; - phi::funcs::Real* output_; + dtype::Real* output_; int64_t numel_; }; // angel function for real template -struct AngleFunctor>> { +struct AngleFunctor>> { AngleFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -456,25 +415,22 @@ struct AngleGradFunctor; // angle grad for complex template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { if (x_[idx] == T(0)) { dx_[idx] = T(0); } else { - const phi::funcs::Real r_square = + const phi::dtype::Real r_square = x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, dout_[idx] * x_[idx].real / r_square); } } - const phi::funcs::Real* dout_; + const phi::dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; @@ -482,16 +438,13 @@ struct AngleGradFunctor>> { // angle grad for real template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } - const phi::funcs::Real* dout_; + const dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index e122e6b1e9c8a..5c424316a83df 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -27,14 +27,14 @@ template struct CudaAbsFunctor; template -struct CudaAbsFunctor>> { - __device__ __forceinline__ phi::funcs::Real operator()(const T x) const { +struct CudaAbsFunctor>> { + __device__ __forceinline__ phi::dtype::Real operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor>> { +struct CudaAbsFunctor>> { __device__ __forceinline__ T operator()(const T x) const { return std::abs(x); } @@ -42,12 +42,12 @@ struct CudaAbsFunctor>> { template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - ctx.template Alloc>(out); + ctx.template Alloc>(out); std::vector ins = {&x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::ElementwiseKernel>(ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(ctx, ins, &outs, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu 
b/paddle/phi/kernels/gpu/complex_kernel.cu index d0b086718a444..e03e079581a9b 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -38,11 +38,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, GPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 78c25200bbd28..9dad40b57c916 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -47,7 +47,7 @@ void AbsGradKernel(const Context& ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* x_data = x.data(); ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index a10481284b17f..03896a2353dda 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -24,7 +24,7 @@ void RealGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); @@ -38,7 +38,7 @@ void ImagGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index ff5cf86ed2ea2..72b1328833979 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -39,8 +39,8 @@ void RealKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); @@ -53,8 +53,8 @@ void ImagKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/tests/common/test_data_type.cc b/paddle/phi/tests/common/test_data_type.cc index c962c68b4d5f2..5a1b41d796d33 100644 --- a/paddle/phi/tests/common/test_data_type.cc +++ b/paddle/phi/tests/common/test_data_type.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" namespace phi { namespace tests { @@ -71,5 +72,20 @@ TEST(DataType, OStream) { } } +TEST(TypeTraits, Complex) { + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX64), + phi::DataType::FLOAT32); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX128), + phi::DataType::FLOAT64); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::FLOAT32), phi::DataType::FLOAT32); + + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT32), + phi::DataType::COMPLEX64); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT64), + phi::DataType::COMPLEX128); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::COMPLEX64), + phi::DataType::COMPLEX64); +} + } // namespace tests } // namespace phi From 90ab7403753acad5c93b425f6a909a526aa57a3d Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 2 Mar 2022 15:11:42 +0800 Subject: [PATCH 05/41] [KP] Activation op registration for XPU2. part 1/2 (#40002) --- .../{activation_op.cu => activation_op.kps} | 64 +++++++++++++++++++ .../platform/device/xpu/xpu_op_kpfirst_list.h | 26 ++++++++ 2 files changed, 90 insertions(+) rename paddle/fluid/operators/{activation_op.cu => activation_op.kps} (94%) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.kps similarity index 94% rename from paddle/fluid/operators/activation_op.cu rename to paddle/fluid/operators/activation_op.kps index e578ad899e74b..e1afb3919f813 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.kps @@ -1861,3 +1861,67 @@ REGISTER_OP_CUDA_KERNEL( __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) + +#ifdef PADDLE_WITH_XPU_KP +#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_KERNEL( \ + act_type, KP, plat::XPUPlace, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ + ops::ActivationGradCudaKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor, + CudaReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, + CudaSoftplusGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, + CudaHardSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, + CudaCELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); 
+REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, + CudaLog1pGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, + CudaBReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, + CudaSoftReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, + CudaSoftsignGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, + CudaRelu6GradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, + CudaHardShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, + CudaHardSigmoidFunctor, + CudaHardSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, + CudaSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, + CudaThresholdedReluFunctor, + CudaThresholdedReluGradFunctor); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index f79ef8505d878..c5dff84723ccf 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -30,6 +30,32 @@ XPUOpMap& get_kp_ops() { static XPUOpMap s_xpu_kp_kernels{ {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + // activation op + {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softplus", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"celu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"silu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softshrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"ceil", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"floor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log1p", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"brelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"soft_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softsign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu6", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_sigmoid", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; From 244ae318c2fbfea0ab4315a17f6e6296c6be2624 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 2 Mar 2022 15:24:36 +0800 Subject: [PATCH 06/41] [fleet_executor] Add entrance of FleetExecutor 
in AnalysisPredictor for distributed inference (#39992) --- .../distributed/fleet_executor/carrier.cc | 24 +- .../distributed/fleet_executor/carrier.h | 7 +- .../fleet_executor/fleet_executor.cc | 48 ++- .../fleet_executor/fleet_executor.h | 10 +- .../distributed/fleet_executor/task_node.cc | 11 +- .../distributed/fleet_executor/task_node.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 3 + .../fluid/inference/api/analysis_predictor.cc | 289 +++++++++++++++++- .../fluid/inference/api/analysis_predictor.h | 59 ++++ .../inference/api/paddle_analysis_config.h | 57 ++++ .../fluid/inference/tests/api/CMakeLists.txt | 6 + .../tests/api/analyzer_dist_model_tester.cc | 72 +++++ paddle/fluid/pybind/bind_fleet_executor.cc | 2 +- paddle/fluid/pybind/inference_api.cc | 19 +- python/paddle/fluid/executor.py | 5 +- 15 files changed, 581 insertions(+), 33 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 56d8da3eca4b5..0d5d328fd32cc 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -46,7 +48,8 @@ void Carrier::Init( const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place) { + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; @@ -60,7 +63,7 @@ void Carrier::Init( microbatch_scopes_.resize(num_micro_batches); for (int i = 0; i < num_micro_batches; ++i) { microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program); + CopyParameters(i, program, inference_root_scope_vars); } // TODO(fleet_exe dev): thread pool @@ -80,12 +83,23 @@ void Carrier::Release() { Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } -void Carrier::CopyParameters(int microbatch_id, - const framework::ProgramDesc& program) { +void Carrier::CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars) { auto& global_block = program.Block(0); + std::map inference_root_scope_var_map; + for (auto var_name : inference_root_scope_vars) { + inference_root_scope_var_map.insert({var_name, 1}); + } for (auto& var : global_block.AllVars()) { - if (var->Persistable() && microbatch_id == 0) { + std::string var_name = var->Name(); + bool force_root = inference_root_scope_var_map.find(var_name) != + inference_root_scope_var_map.end(); + if (force_root) { + VLOG(4) << var_name << " will be forced to be created in the root scope."; + } + if ((var->Persistable() || force_root) && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(5) << "Create persistable var: " << var->Name() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 
9a74fa78c0e76..d35a3260915e2 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -57,9 +57,12 @@ class Carrier final { const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place); + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars = {}); - void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); + void CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars); void Release(); void Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 457549a27b4b7..e946d78550ff1 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" @@ -52,7 +53,8 @@ void FleetExecutor::Init( const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank) { + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( "Fleet executor is inited with empty task node")); @@ -64,6 +66,37 @@ void FleetExecutor::Init( } } auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); + // NOTE: For inference, the vars in inference_root_scope_vars + // shouldn't be deleted during inf, for that they may be the result of the + // inf. If they are GCed, it will cause error during ZeroCopy the result. 
+ std::vector changed_ops; + for (auto pair : unused_vars) { + const framework::OperatorBase* op = pair.first; + std::vector unused = pair.second; + for (auto name : inference_root_scope_vars) { + auto iter = std::find(unused.begin(), unused.end(), name); + if (iter != unused.end()) { + VLOG(3) << "Removing var: [" << name + << "] from the unused vars list of op: [" << op->Type() << "]"; + unused.erase(iter); + if (std::find(changed_ops.begin(), changed_ops.end(), op) == + changed_ops.end()) { + // record the op whose unused vars have been updated + changed_ops.emplace_back(op); + } + } + } + // update the unused vars list in the map + unused_vars[op] = unused; + } + for (auto op : changed_ops) { + auto iter = unused_vars.find(op); + if (iter->second.empty()) { + // remove those ops in the map that have empty unused vars list + VLOG(3) << "Removing op: [" << op->Type() << "] from unused_vars map."; + unused_vars.erase(iter); + } + } runtime_graph_ = std::make_shared(); std::unordered_map interceptor_id_to_task; for (auto task_node : task_nodes) { @@ -82,17 +115,18 @@ void FleetExecutor::Init( carrier_ids_.insert(carrier_id); // Set current running carrier GlobalVal::Set(new std::string(carrier_id)); - InitCarrier(carrier, scope, place, num_micro_batches, program_desc); + InitCarrier(carrier, scope, place, num_micro_batches, program_desc, + inference_root_scope_vars); GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, - int64_t num_micro_batches, - const framework::ProgramDesc& program_desc) { +void FleetExecutor::InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), program_desc, scope, - num_micro_batches, place); + num_micro_batches, place, inference_root_scope_vars); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index fa65309127bec..ccdb3dcc45948 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -42,15 +42,17 @@ class FleetExecutor final { const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank); + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars = {}); void Run(const std::string& carrier_id); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, int64_t num_micro_batches, - const framework::ProgramDesc& program_desc); + void InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 6de7038b3231f..95e4c73305998 100644 --- 
a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -52,11 +52,20 @@ void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) { program_ = program; } -void TaskNode::Init() { +void TaskNode::Init(bool use_feed_fetch_ops) { + if (!use_feed_fetch_ops) { + VLOG(3) << "TaskNode will be inited without feed and fetch ops"; + } if (ops_.empty()) { // Q (for fleet executor dev): should we need another reset funct? VLOG(3) << "Task node will be inited by calling Init()."; for (const auto& op_desc : program_->Block(0).AllOps()) { + if (!use_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], " + << op_desc->Type() << " -> " << op_desc->Output("Out")[0]; + continue; + } ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); } for (const auto& op : ops_vec_) { diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index b655d140d37a5..4764d4fd4af87 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -46,7 +46,7 @@ class TaskNode final { ~TaskNode() = default; void SetProgram(paddle::framework::ProgramDesc* program); - void Init(); + void Init(bool use_feed_fetch_ops = true); int64_t rank() const { return rank_; } int64_t task_id() const { return task_id_; } int32_t role() const { return role_; } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index fd2ccffae3b4a..9c33d70030645 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -274,6 +274,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + // fleet exe related + CP_MEMBER(dist_config_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd6e3a3c759c0..5492c3b0d2645 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/version.h" @@ -47,6 +48,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/string/split.h" + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -186,14 +195,14 @@ bool AnalysisPredictor::Init( return false; } + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + // Prepare executor, create local variables. 
if (!PrepareExecutor()) { return true; } - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; } @@ -359,6 +368,13 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; + return PrepareFleetExecutor(); + } +#endif DisablePrepareDataOpt(inference_program_, 0, false); executor_->Prepare(sub_scope_, *inference_program_, 0, @@ -371,6 +387,226 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +bool AnalysisPredictor::PrepareFleetExecutor() { + VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; + if (config_.dist_config().nranks() > 1 && !CommInit()) { + return false; + } + task_node_.reset(new distributed::TaskNode(inference_program_.get(), + config_.dist_config().rank())); + // With auto cut, there is no concept of pp, no need to add dependency. + task_node_->SetType("Compute"); + task_node_->Init(config_.use_feed_fetch_ops_enabled()); + executor_desc_ = distributed::FleetExecutorDesc(); + executor_desc_.set_cur_rank(config_.dist_config().rank()); + std::unordered_map id_to_rank; + for (int i = 0; i < config_.dist_config().nranks(); ++i) { + distributed::RankInfo *rank_info = executor_desc_.add_cluster_info(); + rank_info->set_rank(i); + rank_info->set_ip_port(config_.dist_config().trainer_endpoints()[i]); + id_to_rank.insert({i, i}); + } + fleet_exe_.reset(new distributed::FleetExecutor(executor_desc_)); + // NOTE: Vars of feed fetch ops are not persistable, + // which will result in that those vars will be created in + // the subscope (microscope) in fleet executor. This will + // cause that the GetInputTensor/GetOutputTensor funct + // in analysis predictor cannot find those vars in the scope + // returned by the DistModel, since DistModel only return the + // root scope. 
So, those vars must to be created in the root + // scope instead of in the microscope + std::vector feed_fetch_vars; + for (auto pair : idx2feeds_) { + feed_fetch_vars.emplace_back(pair.second); + } + for (auto pair : idx2fetches_) { + feed_fetch_vars.emplace_back(pair.second); + } + fleet_exe_->Init(config_.dist_config().carrier_id(), + *(inference_program_.get()), scope_.get(), place_, 1, + {task_node_.get()}, id_to_rank, feed_fetch_vars); + return true; +} + +bool AnalysisPredictor::CommInit() { + std::map> ring_id_to_ranks{}; + std::map> rank_to_ring_ids{}; + if (!LoadConverterConfig(&ring_id_to_ranks, &rank_to_ring_ids)) { + VLOG(3) << "Load converter config failed, DistModel init failed."; + return false; + } + std::unique_ptr comm_init_program( + new framework::ProgramDesc()); + framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); + std::vector &ring_ids = + rank_to_ring_ids[config_.dist_config().rank()]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = ring_id_to_ranks[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = ring_id_to_ranks[ring_id]; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + break; + } + rank_in_group += 1; + } + std::vector peer_endpoints; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + continue; + } + peer_endpoints.emplace_back( + config_.dist_config().trainer_endpoints()[rank]); + } + InsertCommOp(var_name_base + std::to_string(order), ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; + } + framework::NaiveExecutor e(place_); + e.CreateVariables(*comm_init_program, 0, true, scope_.get()); + e.Prepare(scope_.get(), *comm_init_program, 0, false); + e.Run(); + VLOG(3) << "Comm init successful."; + return true; +} + +void AnalysisPredictor::InsertCommOp( + std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, framework::BlockDesc *block, + int ring_id) { + /* + * tmp_var_name: the var name for var comm_id + * nranks: number of total ranks + * rank: the rank of local rank in the comm group + * peer_endpoints: peer's endpoints + * block: the block where to insert the comm ops + * ring_id: the ring_id to be inited + */ + const std::string &endpoint = config_.dist_config().current_endpoint(); + std::stringstream ss; + ss << "Init comm with tmp var: " << tmp_var_name + << ". The ring id is: " << ring_id << ". The group has: " << nranks + << " ranks. Current rank in the group is: " << rank + << ". The endpoint is: " << endpoint << ". 
Peer endpoints are: "; + for (auto ep : peer_endpoints) { + ss << ep << ", "; + } + VLOG(3) << ss.str(); + if (config_.use_gpu()) { + framework::VarDesc *new_var = block->Var(tmp_var_name); + new_var->SetType(framework::proto::VarType::RAW); + new_var->SetPersistable(true); + framework::OpDesc *gen_nccl_id_op = block->AppendOp(); + gen_nccl_id_op->SetType("c_gen_nccl_id"); + gen_nccl_id_op->SetOutput("Out", {tmp_var_name}); + gen_nccl_id_op->SetAttr("rank", rank); + gen_nccl_id_op->SetAttr("endpoint", + config_.dist_config().current_endpoint()); + gen_nccl_id_op->SetAttr("other_endpoints", peer_endpoints); + gen_nccl_id_op->SetAttr("ring_id", ring_id); + gen_nccl_id_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + gen_nccl_id_op->CheckAttrs(); + framework::OpDesc *comm_init_op = block->AppendOp(); + comm_init_op->SetType("c_comm_init"); + comm_init_op->SetInput("X", {tmp_var_name}); + comm_init_op->SetAttr("rank", rank); + comm_init_op->SetAttr("nranks", nranks); + comm_init_op->SetAttr("ring_id", ring_id); + comm_init_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + comm_init_op->CheckAttrs(); + } else { + LOG(WARNING) << "DistModelInf doesn't init comm."; + // TODO(fleet exe dev): comm init for more devices + } +} + +bool AnalysisPredictor::LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids) { + VLOG(3) << "Going to load converter config from: " + << config_.dist_config().comm_init_config() << "\n"; + std::ifstream fin(config_.dist_config().comm_init_config(), std::ios::in); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + config_.dist_config().comm_init_config())); + std::string line; + bool ring_to_rank{true}; + // Reading config from file, the config file should like these format + // [ring_id -> ranks] + // 0,0,1,2,3 + // 1,0,1 + // 2,2,3 + // 21,0,1 + // 22,1,2 + // 23,2,3 + // [rank -> ring_ids] + // 0,0,1,21 + // 1,0,1,21,22 + // 2,0,2,22,23 + // 3,0,2,23 + while (std::getline(fin, line)) { + std::vector one_line = paddle::string::Split(line, ','); + if (one_line.size() == 1) { + // start a new section of the config + if (line == "[ring_id -> ranks]") { + ring_to_rank = true; + } else if (line == "[rank -> ring_ids]") { + ring_to_rank = false; + } + } else { + // parse key - values pairs in one section + int64_t key = std::stoll(one_line[0]); + for (size_t i = 1; i < one_line.size(); ++i) { + int64_t val = std::stoll(one_line[i]); + if (ring_to_rank) { + if (ring_id_to_ranks->find(key) == ring_id_to_ranks->end()) { + ring_id_to_ranks->insert({key, std::vector()}); + } + ring_id_to_ranks->at(key).emplace_back(val); + } else { + if (rank_to_ring_ids->find(key) == rank_to_ring_ids->end()) { + rank_to_ring_ids->insert({key, std::vector()}); + } + rank_to_ring_ids->at(key).emplace_back(val); + } + // NOTE: add more configuration sections here + } + } + } + std::stringstream ss; + ss << "Loaded the following converter config:\n"; + ss << "ring_id_to_ranks:\n"; + for (auto pair : *ring_id_to_ranks) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + ss << "rank_to_ring_ids:\n"; + for (auto pair : *rank_to_ring_ids) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + VLOG(3) << ss.str(); + return true; +} +#endif + void 
AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN std::vector> inputs_shape; @@ -946,13 +1182,24 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "The variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -985,13 +1232,24 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "he variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1023,6 +1281,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "ZeroCopyRun will use the fleet executor."; + inference::Timer timer; + timer.tic(); + fleet_exe_->Run(config_.dist_config().carrier_id()); + VLOG(3) << "Fleet executor inf runs once use: " + << std::to_string(timer.toc()) << "ms"; + return true; + } +#endif paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) { @@ -1035,7 +1305,6 @@ bool AnalysisPredictor::ZeroCopyRun() { MkldnnPreSet(shape_vector); } #endif - executor_->Run(); if (config_.shape_range_info_collected()) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a8e56101d37da..8ed183dae0b1b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,10 @@ #include #include #include +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#endif #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -391,6 +395,53 @@ class AnalysisPredictor : public PaddlePredictor { void 
StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet exe related + + /// + /// \brief prepare for fleet executor to run + /// + /// Used in AnalysisPredictor::Init(), + /// + bool PrepareFleetExecutor(); + + /// + /// \brief init NCCL env for multi gpus inference + /// + /// Used in AnalysisPredictor::PrepareFleetExecutor() + /// + bool CommInit(); + + /// + /// \brief read the config to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] ring_id_to_ranks: a ptr to ring_id_to_ranks + /// \param[in] rank_to_ring_ids: a ptr to rank_to_ring_ids + /// + bool LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids); + + /// + /// \brief add ops and run them with NaiveExecutor to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] tmp_var_name: var name to hold NCCL unique id + /// \param[in] nranks: number of ranks in one comm group + /// \param[in] rank: relative rank of current rank in the comm group + /// \param[in] peer_endpoints: group's peers' endpoints + /// \param[in] block: the block to insert comm ops + /// \param[in] ring_id: the ring id to be used to init NCCL env + /// + void InsertCommOp(std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, + framework::BlockDesc *block, int ring_id); +#endif + private: AnalysisConfig config_; Argument argument_; @@ -436,6 +487,14 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; int clone_num_{1}; + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet executor related + distributed::FleetExecutorDesc executor_desc_; + std::shared_ptr fleet_exe_; + std::shared_ptr task_node_; +#endif }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 180c028c6a610..b4a358394404f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -76,6 +76,54 @@ struct LiteNNAdapterConfig { LiteNNAdapterConfig& Disable(); }; +struct DistConfig { + bool use_dist_model() const { return use_dist_model_; } + void EnableDistModel(bool use_dist_model) { + use_dist_model_ = use_dist_model; + } + + std::vector trainer_endpoints() const { + return trainer_endpoints_; + } + + std::string current_endpoint() const { return current_endpoint_; } + + void SetEndpoints(const std::vector& trainer_endpoints, + const std::string& current_endpoint) { + trainer_endpoints_ = trainer_endpoints; + current_endpoint_ = current_endpoint; + } + + int64_t nranks() const { return nranks_; } + + int64_t rank() const { return rank_; } + + void SetRanks(int64_t nranks, int64_t rank) { + nranks_ = nranks; + rank_ = rank; + } + + std::string comm_init_config() const { return comm_init_config_; } + + void SetCommInitConfig(const std::string& comm_init_config) { + comm_init_config_ = comm_init_config; + } + + void SetCarrierId(const std::string& carrier_id) { carrier_id_ = carrier_id; } + + std::string carrier_id() const { return carrier_id_; } + + protected: + // DistModel Inference related + bool use_dist_model_{false}; // whether use DistModel or not + std::vector trainer_endpoints_{}; // all trainers' endpoints + std::string current_endpoint_{}; // current trainer's endpoint + int64_t nranks_{1}; // total 
ranks (number of trainers) + int64_t rank_{0}; // rank + std::string comm_init_config_{}; // converter config path + std::string carrier_id_{"inference"}; +}; + /// /// \brief configuration manager for AnalysisPredictor. /// \since 1.7.0 @@ -763,6 +811,12 @@ struct PD_INFER_DECL AnalysisConfig { LiteNNAdapterConfig& NNAdapter() { return nnadapter_config_; } + void SetDistConfig(const DistConfig& dist_config) { + dist_config_ = dist_config; + } + + const DistConfig& dist_config() const { return dist_config_; } + protected: // Update the config. void Update(); @@ -902,6 +956,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable bool is_valid_{true}; std::string opt_cache_dir_; friend class paddle_infer::experimental::InternalUtils; + + // fleet exe related + DistConfig dist_config_{}; }; } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 0281fd917658a..8c96499a022f7 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -720,6 +720,12 @@ inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zeroco EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) +if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model) +endif() + inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc new file mode 100644 index 0000000000000..7cf6e2adfc688 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { + +TEST(test_dist_model, dist_model) { + std::cout << "Analysis Predictor DistModel test." 
<< std::endl; + AnalysisConfig config; + config.SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + config.SwitchUseFeedFetchOps(false); + config.EnableUseGpu(100, 0); + DistConfig dist_config; + dist_config.SetRanks(1, 0); + dist_config.EnableDistModel(true); + dist_config.SetEndpoints({""}, ""); + config.SetDistConfig(dist_config); + + auto predictor = paddle_infer::CreatePredictor(config); + int batch_size = 1; + int channels = 1; + int height = 48; + int width = 512; + int nums = batch_size * channels * height * width; + std::cout << "Created predictor." << std::endl; + + float* input = new float[nums]; + for (int i = 0; i < nums; ++i) input[i] = 0; + auto input_names = predictor->GetInputNames(); + + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch_size, channels, height, width}); + input_t->CopyFromCpu(input); + std::cout << "Input data." << std::endl; + + predictor->Run(); + std::cout << "Zero Copy Run." << std::endl; + + std::vector out_data; + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + std::cout << "Output data." << std::endl; + delete[] input; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index b29cc10e8f56f..8491d1e224930 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -168,7 +168,7 @@ void BindFleetExecutor(py::module* m) { .def("set_run_at_offset", &TaskNode::SetRunAtOffset) .def("set_type", &TaskNode::SetType) .def("role", &TaskNode::role) - .def("init", &TaskNode::Init) + .def("init", [](TaskNode& self) { self.Init(); }) .def("set_program", &TaskNode::SetProgram); py::class_(*m, "DistModelConfig") diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index eafd5baab7d24..9b5041154c95a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -658,7 +658,24 @@ void BindAnalysisConfig(py::module *m) { return dynamic_cast(self.pass_builder()); }, py::return_value_policy::reference) - .def("nnadapter", &AnalysisConfig::NNAdapter); + .def("nnadapter", &AnalysisConfig::NNAdapter) + .def("set_dist_config", &AnalysisConfig::SetDistConfig) + .def("dist_config", &AnalysisConfig::dist_config); + + py::class_(*m, "DistConfig") + .def(py::init<>()) + .def("set_carrier_id", &DistConfig::SetCarrierId) + .def("set_comm_init_config", &DistConfig::SetCommInitConfig) + .def("set_endpoints", &DistConfig::SetEndpoints) + .def("set_ranks", &DistConfig::SetRanks) + .def("enable_dist_model", &DistConfig::EnableDistModel) + .def("carrier_id", &DistConfig::carrier_id) + .def("current_endpoint", &DistConfig::current_endpoint) + .def("trainer_endpoints", &DistConfig::trainer_endpoints) + .def("nranks", &DistConfig::nranks) + .def("rank", &DistConfig::rank) + .def("comm_init_config", &DistConfig::comm_init_config) + .def("use_dist_model", &DistConfig::use_dist_model); } void BindLiteNNAdapterConfig(py::module *m) { diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e372727b0f0b6..a7971763f53e1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2034,8 
+2034,11 @@ def _prepare_fleet_executor_carrier(self, fleet_opt['task_id_to_rank'] = task_id_to_rank place = core.Place() place.set_place(self.place) + # NOTE: the last argument is used to force create some vars in root scope, + # won't be used during train. self._fleet_executor.init(carrier_id, program.desc, scope, place, - num_micro_batches, tasks, task_id_to_rank) + num_micro_batches, tasks, task_id_to_rank, + []) def _run_using_fleet_executor(self, program=None, From bc113e10487115fd91cfc738c4279372eeb7c2a2 Mon Sep 17 00:00:00 2001 From: joeqiao12 <45232181+joeqiao12@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:29:24 +0800 Subject: [PATCH 07/41] add logic kernel for mlu (#39940) --- .../operators/controlflow/compare_op_mlu.cc | 200 ++++++++++++++++++ .../unittests/mlu/test_compare_op_mlu.py | 157 ++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 paddle/fluid/operators/controlflow/compare_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc new file mode 100644 index 0000000000000..9dc287ab76a67 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class EqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class NotEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_NE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterEqualMLUKernel : 
public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + equal, ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + not_equal, ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_than, ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_equal, ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_than, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_equal, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py new file mode 100644 index 0000000000000..87997acce02a3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def create_test_class(op_type, typename, callback): + class Cls(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.MLUPlace(0) + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + out = callback(x, y) + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': out} + self.op_type = op_type + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_output(self): + self.check_output_with_place(place=self.place) + + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + a = fluid.layers.data(name='a', shape=[2], dtype='float32') + b = fluid.layers.data(name='b', shape=[2], dtype='float32') + c = fluid.layers.data(name='c', shape=[2], dtype='int16') + d = fluid.create_lod_tensor(np.array([[-1]]), [[1]], self.place) + + op = eval("fluid.layers.%s" % self.op_type) + self.assertRaises(TypeError, op, x=a, y=b, axis=True) + self.assertRaises(TypeError, op, x=a, y=b, force_cpu=1) + self.assertRaises(TypeError, op, x=a, y=b, cond=1) + self.assertRaises(TypeError, op, x=a, y=c) + self.assertRaises(TypeError, op, x=c, y=a) + self.assertRaises(TypeError, op, x=a, y=d) + self.assertRaises(TypeError, op, x=d, y=a) + self.assertRaises(TypeError, op, x=c, y=d) + + def test_dynamic_api(self): + paddle.disable_static() + paddle.set_device('mlu:0') + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + real_result = callback(x, y) + x = paddle.to_tensor(x, dtype=typename) + y = paddle.to_tensor(y, dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.assertEqual((out.numpy() == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_1(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 1, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_2(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 1, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_3(self): + paddle.enable_static() + with 
program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[5], dtype=typename) + y = paddle.static.data(name='y', shape=[3, 1], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 5).reshape((5)).astype(typename) + input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_attr_name(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[4], dtype=typename) + y = fluid.layers.data(name='y', shape=[4], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x=x, y=y, name="name_%s" % (self.op_type)) + self.assertEqual("name_%s" % (self.op_type) in out.name, True) + + cls_name = "{0}_{1}".format(op_type, typename) + Cls.__name__ = cls_name + globals()[cls_name] = Cls + + +for _type_name in {'float16', 'float32', 'int32', 'bool'}: + if _type_name == 'int32' or _type_name == 'bool': + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + continue + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) + create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) + create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) + create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) + create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) + +if __name__ == '__main__': + unittest.main() From 0c3f7fbcfe68bfb34b0ed5d9aad6e3a8c0cca43f Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 2 Mar 2022 15:30:09 +0800 Subject: [PATCH 08/41] Upgrade new profiler (#39984) * add new profiler components * fix bug * upgrade new profiler * fix operator.cc * fix operator.cc * fix cmakelists.txt * fix bug * fix according to pr * fix bug * fix cmake * fix bug * fix a bug * fix bug * fix bug --- paddle/fluid/framework/operator.cc | 8 +- paddle/fluid/platform/profiler/CMakeLists.txt | 10 +- .../platform/profiler/chrometracing_logger.cc | 320 ++++++++++++++---- .../platform/profiler/chrometracing_logger.h | 11 + .../platform/profiler/cpu_utilization.cc | 47 ++- .../platform/profiler/dump/CMakeLists.txt | 3 - .../profiler/dump/deserialization_reader.cc | 16 +- .../profiler/dump/deserialization_reader.h | 4 +- .../platform/profiler/dump/nodetree.proto | 27 +- .../profiler/dump/serialization_logger.cc | 12 + .../profiler/dump/serialization_logger.h | 5 + .../dump/test_serialization_logger.cc | 28 +- .../fluid/platform/profiler/event_python.cc | 122 +++++++ paddle/fluid/platform/profiler/event_python.h | 26 +- paddle/fluid/platform/profiler/profiler.cc | 35 +- paddle/fluid/platform/profiler/profiler.h | 10 +- .../fluid/platform/profiler/profiler_test.cc | 11 +- paddle/fluid/platform/profiler/trace_event.h | 2 + 18 files changed, 578 insertions(+), 119 deletions(-) mode change 100755 => 100644 paddle/fluid/platform/profiler/dump/serialization_logger.h create mode 100644 paddle/fluid/platform/profiler/event_python.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b12ad552aba6e..b91ee3c2d633d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -264,10 +264,10 @@ void 
OperatorBase::Run(const Scope& scope, const platform::Place& place) { // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( Type(), platform::TracerEventType::Operator, 1); - // auto op_name = platform::OpName(outputs_, Type()); - // platform::RecordEvent op_name_record_event( - // op_name, platform::TracerEventType::Operator, 1, - // platform::EventRole::kUniqueOp); + auto op_name = platform::OpName(outputs_, Type()); + platform::RecordEvent op_name_record_event( + op_name, platform::TracerEventType::Operator, 10, + platform::EventRole::kUniqueOp); RunImpl(scope, place); } diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 5acdfa39569f0..c903a52530ccb 100755 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -2,10 +2,12 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) add_subdirectory(dump) +cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) +cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) +cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) +cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 7b207ea7b2011..4061e2d4d494d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -18,40 +18,17 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { static const char* kSchemaVersion = "1.0.0"; static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json"; -static uint32_t num_span = 0; - -static int64_t nsToUs(int64_t ns) { return ns / 1000; } - -template -std::string string_format(const std::string& format, Args... args) { - int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ - 1; // Extra space for '\0' - PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( - "Error during profiler data formatting.")); - auto size = static_cast(size_s); - auto buf = std::make_unique(size); - std::snprintf(buf.get(), size, format.c_str(), args...); - return std::string(buf.get(), size - 1); // exclude the '\0' -} - -std::string GetStringFormatLocalTime() { - std::time_t rawtime; - std::tm* timeinfo; - char buf[100]; - std::time(&rawtime); - timeinfo = std::localtime(&rawtime); - std::strftime(buf, 100, "%F-%X", timeinfo); - return std::string(buf); -} +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -60,16 +37,19 @@ static std::string DefaultFileName() { } const char* ChromeTracingLogger::categary_name_[] = { - "operator", "dataloader", "profile_step", "cuda_runtime", "kernel", - "memcpy", "memset", "user_defined", "others"}; + "Operator", "Dataloader", "ProfileStep", "CudaRuntime", + "Kernel", "Memcpy", "Memset", "UserDefined", + "OperatorInner", "Forward", "Backward", "Optimization", + "Communication", "PythonOp", "PythonUserDefined"}; void ChromeTracingLogger::OpenFile() { output_file_stream_.open(filename_, std::ofstream::out | std::ofstream::trunc); if (!output_file_stream_) { - VLOG(2) << "Unable to open file for writing profiling data." << std::endl; + LOG(WARNING) << "Unable to open file for writing profiling data." + << std::endl; } else { - VLOG(0) << "writing profiling data to " << filename_ << std::endl; + LOG(INFO) << "writing profiling data to " << filename_ << std::endl; } } @@ -122,21 +102,54 @@ void ChromeTracingLogger::LogHostTraceEventNode( if (!output_file_stream_) { return; } - output_file_stream_ << string_format( - std::string( - R"JSON( + switch (host_node.Type()) { + case TracerEventType::ProfileStep: + case TracerEventType::Forward: + case TracerEventType::Backward: + case TracerEventType::Dataloader: + case TracerEventType::Optimization: + case TracerEventType::PythonOp: + case TracerEventType::PythonUserDefined: + output_file_stream_ << string_format( + std::string( + R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { - + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), - categary_name_[static_cast(host_node.Type())]); + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + default: + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "%s", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %lld, + "ph": "X", "cat": "%s", + "args": { + "start_ns": %lld, + "end_ns": %lld + } + }, + )JSON"), + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + } + + pid_tid_set_.insert({host_node.ProcessId(), host_node.ThreadId()}); } void ChromeTracingLogger::LogRuntimeTraceEventNode( @@ -148,11 +161,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, 
"dur": %lld, "ph": "X", "cat": "%s", "args": { - "correlation id": %d + "correlation id": %d, + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), @@ -160,7 +175,23 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), nsToUs(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId()); + runtime_node.CorrelationId(), runtime_node.StartNs(), + runtime_node.EndNs()); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); + + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, + "ph": "s", "cat": "async" + }, + )JSON"), + runtime_node.CorrelationId(), runtime_node.ProcessId(), + runtime_node.ThreadId(), + nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1)); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); } void ChromeTracingLogger::LogDeviceTraceEventNode( @@ -180,6 +211,36 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( default: break; } + if (nsToUs(device_node.Duration()) == 0) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs())); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } else { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async", "bp": "e" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), + nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1)); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } } void ChromeTracingLogger::HandleTypeKernel( @@ -188,16 +249,21 @@ void ChromeTracingLogger::HandleTypeKernel( float blocks_per_sm = 0.0; float warps_per_sm = 0.0; float occupancy = 0.0; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUPTI) constexpr int threads_per_warp = 32; const gpuDeviceProp& device_property = GetDeviceProperties(device_node.DeviceId()); - blocks_per_sm = - (kernel_info.grid_x * kernel_info.grid_y * kernel_info.grid_z) / - device_property.multiProcessorCount; + blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * + kernel_info.grid_z) / + device_property.multiProcessorCount; warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / threads_per_warp; + occupancy = CalculateEstOccupancy( + device_node.DeviceId(), kernel_info.registers_per_thread, + kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory, + kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, + blocks_per_sm); #endif output_file_stream_ << string_format( @@ -208,15 +274,17 @@ void ChromeTracingLogger::HandleTypeKernel( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "registers per thread": %d, - "shared memory": %f, + "shared memory": %d, "blocks per SM": %f, "warps per SM": %f, "grid": [%d, %d, %d], "block": [%d, %d, %d], - "est. 
achieved occupancy %": %f + "theoretical achieved occupancy %%": %f } }, )JSON"), @@ -224,12 +292,13 @@ void ChromeTracingLogger::HandleTypeKernel( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, blocks_per_sm, warps_per_sm, kernel_info.grid_x, kernel_info.grid_y, kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y, - kernel_info.block_z, occupancy); + kernel_info.block_z, occupancy * 100); } void ChromeTracingLogger::HandleTypeMemcpy( @@ -247,6 +316,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "stream": %d, "correlation id": %d, "bytes": %d, "memory bandwidth (GB/s)": %f } @@ -256,8 +327,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StreamId(), device_node.CorrelationId(), - memcpy_info.num_bytes, memory_bandwidth); + device_node.StartNs(), device_node.EndNs(), device_node.StreamId(), + device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); } void ChromeTracingLogger::HandleTypeMemset( @@ -271,6 +342,8 @@ void ChromeTracingLogger::HandleTypeMemset( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "bytes": %d, "value": %d @@ -281,7 +354,8 @@ void ChromeTracingLogger::HandleTypeMemset( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); } @@ -290,10 +364,10 @@ void ChromeTracingLogger::StartLog() { R"JSON( { "schemaVersion": "%s", - "displayTimeUnit": "us", - "SpanNumber": "%d", + "displayTimeUnit": "ms", + "span_indx": "%d", )JSON"), - kSchemaVersion, num_span); + kSchemaVersion, span_indx++); // add device property information #if defined(PADDLE_WITH_CUDA) output_file_stream_ << std::string(R"JSON( @@ -358,11 +432,143 @@ void ChromeTracingLogger::StartLog() { )JSON"); } -void ChromeTracingLogger::EndLog() { +void ChromeTracingLogger::LogMetaInfo( + const std::unordered_map extra_info) { + RefineDisplayName(extra_info); output_file_stream_ << std::string( R"JSON( {} - ] + ], + )JSON"); + output_file_stream_ << std::string(R"JSON( + "ExtraInfo": {)JSON"); + size_t count = extra_info.size(); + for (const auto& kv : extra_info) { + if (count > 1) { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s", + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } else { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s" + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } + count--; + } + output_file_stream_ << std::string(R"JSON( + })JSON"); +} + +void ChromeTracingLogger::RefineDisplayName( + std::unordered_map 
extra_info) { + for (auto it = pid_tid_set_.begin(); it != pid_tid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "process_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "thread %lld:%s(Python)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "thread %lld:%s(C++)" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).first, (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); + } + + for (auto it = deviceid_streamid_set_.begin(); + it != deviceid_streamid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "Deivce %lld (GPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "stream %lld" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, + (*it).first, (*it).second, (*it).second); + } +} + +void ChromeTracingLogger::EndLog() { + output_file_stream_ << std::string( + R"JSON( } )JSON"); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 06734418609d7..20a924a54cabd 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -13,11 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/platform/profiler/output_logger.h" namespace paddle { namespace platform { +// Dump a NodeTrees into a chrome tracing file. +// A ChromeTracingLogger object can only dump a NodeTrees object, +// creates a file in the constructor and closes the file in the destructor. +// should only call LogNodeTrees and LogMetaInfo in order. 
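+//
+// A minimal usage sketch (illustrative only; it mirrors the call sequence of
+// ProfilerResult::Save with format "json" later in this patch, and the
+// `tree` and `extra_info` names below are placeholders, not members of this
+// header):
+//   ChromeTracingLogger logger("trace.json");  // opens the output file
+//   logger.LogNodeTrees(tree);                 // dump every event node
+//   logger.LogMetaInfo(extra_info);            // append ExtraInfo and
+//                                              // display-name metadata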
class ChromeTracingLogger : public BaseLogger { public: explicit ChromeTracingLogger(const std::string& filename); @@ -28,6 +35,7 @@ class ChromeTracingLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); @@ -36,9 +44,12 @@ class ChromeTracingLogger : public BaseLogger { void HandleTypeMemcpy(const DeviceTraceEventNode&); void StartLog(); void EndLog(); + void RefineDisplayName(std::unordered_map); std::string filename_; std::ofstream output_file_stream_; static const char* categary_name_[]; + std::set> pid_tid_set_; + std::set> deviceid_streamid_set_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index 672a9a154535a..ce2e49a1ccd39 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -54,19 +54,16 @@ void CpuUtilization::RecordBeginTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_start_.tms_utime, &nice_time_start_, - &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, - &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_, + &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record beginning." + << std::endl; } fclose(stat_file); } @@ -90,19 +87,17 @@ void CpuUtilization::RecordEndTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_end_.tms_utime, &nice_time_end_, - &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, - &softirq_end_, &steal_end_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record end." 
+ << std::endl; } fclose(stat_file); } diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt index e25333f7a8a73..5045c56afbc63 100644 --- a/paddle/fluid/platform/profiler/dump/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt @@ -1,4 +1 @@ proto_library(nodetreeproto SRCS nodetree.proto) -cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node) -cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node) -cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node) diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index d1049a7dc1908..de3411579d3e9 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" - #include +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -36,11 +36,19 @@ void DeserializationReader::OpenFile() { } } -std::unique_ptr DeserializationReader::Parse() { +std::unique_ptr DeserializationReader::Parse() { if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) { VLOG(2) << "Unable to load node trees in protobuf." << std::endl; return nullptr; } + // restore extra info + ExtraInfo extrainfo; + for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { + ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); + extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"), + extra_info_map.value().c_str()); + } + // restore NodeTrees std::map thread_event_trees_map; for (int node_tree_index = 0; node_tree_index < node_trees_proto_->thread_trees_size(); @@ -95,7 +103,9 @@ std::unique_ptr DeserializationReader::Parse() { } } // restore NodeTrees object - return std::unique_ptr(new NodeTrees(thread_event_trees_map)); + std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); + return std::unique_ptr( + new ProfilerResult(std::move(tree), extrainfo)); } DeserializationReader::~DeserializationReader() { diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 1ad2dabf229ad..e6feb4f9489e8 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" -#include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" namespace paddle { namespace platform { @@ -24,7 +24,7 @@ class DeserializationReader { explicit DeserializationReader(const std::string& filename); explicit DeserializationReader(const char* filename); ~DeserializationReader(); - std::unique_ptr Parse(); + std::unique_ptr Parse(); private: void OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 37dac0e597ce2..7016745059d40 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -32,9 +32,21 @@ enum TracerEventTypeProto { Memset = 6; // Used to mark record defined by user UserDefined = 7; - // A flag to denote the number of current types - NumTypes = 8; -} + // Used to mark operator detail, (such as infer shape, compute) + OperatorInner = 8; + // Used to mark model training or testing perspective, forward process + Forward = 9; + // Used to mark model training perspective, backward process + Backward = 10; + // Used to mark model training perspective, optimization process + Optimization = 11; + // Used to mark distributed training perspective + Communication = 12; + // Used to mark python api + PythonOp = 13; + // Used to mark python level userdefined + PythonUserDefined = 14; +}; message KernelEventInfoProto { // The X-dimension block size for the kernel. @@ -175,7 +187,14 @@ message ThreadNodeTreeProto { repeated HostTraceEventNodeProto host_nodes = 2; } +message ExtraInfoMap { + required string key = 1; + required string value = 2; +} + message NodeTreesProto { required string version = 1; - repeated ThreadNodeTreeProto thread_trees = 2; + required uint32 span_indx = 2; + repeated ThreadNodeTreeProto thread_trees = 3; + repeated ExtraInfoMap extra_info = 4; } diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index d9ed84bd438a7..73021f4362af5 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/utils.h" namespace paddle { @@ -20,6 +21,7 @@ namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; static const char* version = "1.0.0"; +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -39,6 +41,7 @@ void SerializationLogger::OpenFile() { } node_trees_proto_ = new NodeTreesProto(); node_trees_proto_->set_version(std::string(version)); + node_trees_proto_->set_span_indx(span_indx++); } void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { @@ -240,6 +243,15 @@ void SerializationLogger::HandleTypeMemset( device_trace_event); } +void SerializationLogger::LogMetaInfo( + const std::unordered_map extra_info) { + for (const auto& kv : extra_info) { + ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info(); + extra_info_map->set_key(kv.first); + extra_info_map->set_value(kv.second); + } +} + SerializationLogger::SerializationLogger(const std::string& filename) { filename_ = filename.empty() ? DefaultFileName() : filename; OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h old mode 100755 new mode 100644 index 1295be95d4531..378834cff590d --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -11,6 +11,8 @@ limitations under the License. */ #pragma once +#include + #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" #include "paddle/fluid/platform/profiler/output_logger.h" @@ -20,6 +22,7 @@ namespace platform { // Dump a NodeTrees into a profobuf file. // A SerializationLogger object can only dump a NodeTrees object, // creates a file in the constructor and closes the file in the destructor. +// Should only call LogNodeTrees and LogMetaInfo. 
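+//
+// A minimal usage sketch (illustrative only; it follows the same pattern as
+// ProfilerResult::Save with format "pb" elsewhere in this patch, and the
+// `tree` and `extra_info` names are placeholders):
+//   SerializationLogger logger("trace.pb");  // opens the protobuf output file
+//   logger.LogNodeTrees(tree);               // serialize every event node
+//   logger.LogMetaInfo(extra_info);          // append ExtraInfo key/value pairs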
class SerializationLogger : public BaseLogger { public: explicit SerializationLogger(const std::string& filename); @@ -30,12 +33,14 @@ class SerializationLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); void HandleTypeKernel(const DeviceTraceEventNode&); void HandleTypeMemset(const DeviceTraceEventNode&); void HandleTypeMemcpy(const DeviceTraceEventNode&); + std::string filename_; std::ofstream output_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 2fe9626ec76df..dee1019da2b52 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" using paddle::platform::SerializationLogger; using paddle::platform::DeserializationReader; @@ -31,6 +32,7 @@ using paddle::platform::TracerEventType; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::ProfilerResult; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -149,7 +151,8 @@ TEST(SerializationLoggerTest, dump_case1) { TEST(DeserializationReaderTest, restore_case0) { DeserializationReader reader("test_serialization_logger_case0.pb"); - std::unique_ptr tree = reader.Parse(); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); std::map> nodes = tree->Traverse(true); EXPECT_EQ(nodes[10].size(), 4u); @@ -172,3 +175,26 @@ TEST(DeserializationReaderTest, restore_case0) { } } } + +TEST(DeserializationReaderTest, restore_case1) { + DeserializationReader reader("test_serialization_logger_case1.pb"); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); + std::map> nodes = + tree->Traverse(true); + EXPECT_EQ(nodes[10].size(), 1u); + EXPECT_EQ(nodes[11].size(), 1u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } +} diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc new file mode 100644 index 0000000000000..1a6f19d2f93af --- /dev/null +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/chrometracing_logger.h" +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +namespace paddle { +namespace platform { + +HostPythonNode::~HostPythonNode() { + // delete all runtime or device nodes and recursive delete children + for (auto it = children_node_ptrs.begin(); it != children_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = runtime_node_ptrs.begin(); it != runtime_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { + delete *it; + } +} + +HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { + // Copy and transfer EventNode in NodeTree to PythonNode + if (root == nullptr) { + return nullptr; + } + // copy HostTraceEventNode and its children + HostPythonNode* host_python_node = new HostPythonNode(); + host_python_node->name = root->Name(); + host_python_node->type = root->Type(); + host_python_node->start_ns = root->StartNs(); + host_python_node->end_ns = root->EndNs(); + host_python_node->process_id = root->ProcessId(); + host_python_node->thread_id = root->ThreadId(); + for (auto it = root->GetChildren().begin(); it != root->GetChildren().end(); + ++it) { + host_python_node->children_node_ptrs.push_back(CopyTree(*it)); + } + // copy its CudaRuntimeTraceEventNode + for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin(); + runtimenode != root->GetRuntimeTraceEventNodes().end(); ++runtimenode) { + HostPythonNode* runtime_python_node = new HostPythonNode(); + runtime_python_node->name = (*runtimenode)->Name(); + runtime_python_node->type = (*runtimenode)->Type(); + runtime_python_node->start_ns = (*runtimenode)->StartNs(); + runtime_python_node->end_ns = (*runtimenode)->EndNs(); + runtime_python_node->process_id = (*runtimenode)->ProcessId(); + runtime_python_node->thread_id = (*runtimenode)->ThreadId(); + host_python_node->runtime_node_ptrs.push_back(runtime_python_node); + // copy DeviceTraceEventNode + for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin(); + devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end(); + ++devicenode) { + DevicePythonNode* device_python_node = new DevicePythonNode(); + device_python_node->name = (*devicenode)->Name(); + device_python_node->type = (*devicenode)->Type(); + device_python_node->start_ns = (*devicenode)->StartNs(); + device_python_node->end_ns = (*devicenode)->EndNs(); + device_python_node->device_id = (*devicenode)->DeviceId(); + device_python_node->context_id = (*devicenode)->ContextId(); + device_python_node->stream_id = (*devicenode)->StreamId(); + runtime_python_node->device_node_ptrs.push_back(device_python_node); + } + } + return host_python_node; +} + +ProfilerResult::ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info) + : tree_(std::move(tree)), extra_info_(extra_info) { + if (tree_ != nullptr) { + std::map nodetrees = 
tree_->GetNodeTrees(); + for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) { + thread_event_trees_map_[it->first] = CopyTree(it->second); + } + } +} + +ProfilerResult::~ProfilerResult() { + // delete all root nodes + for (auto it = thread_event_trees_map_.begin(); + it != thread_event_trees_map_.end(); ++it) { + delete it->second; + } +} + +void ProfilerResult::Save(const std::string& file_name, + const std::string format) { + if (format == std::string("json")) { + ChromeTracingLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } else if (format == std::string("pb")) { + SerializationLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } + return; +} + +std::unique_ptr LoadProfilerResult(std::string filename) { + DeserializationReader reader(filename); + std::unique_ptr result = reader.Parse(); + return result; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index b0d8eaa242716..12ecb9fde32aa 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -15,8 +15,11 @@ limitations under the License. */ #pragma once #include +#include +#include #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -66,18 +69,29 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} - explicit ProfilerResult(NodeTrees* tree); + explicit ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info); ~ProfilerResult(); std::map GetData() { - return thread_event_trees_map; + return thread_event_trees_map_; } - void Save(const std::string& file_name); + std::unordered_map GetExtraInfo() { + return extra_info_.GetExtraInfo(); + } + + void Save(const std::string& file_name, + const std::string format = std::string("json")); + + std::unique_ptr& GetNodeTrees() { return tree_; } private: - std::map thread_event_trees_map; - NodeTrees* tree_; - HostPythonNode* CopyTree(HostTraceEventNode* node); + std::map thread_event_trees_map_; + std::unique_ptr tree_; + ExtraInfo extra_info_; + HostPythonNode* CopyTree(HostTraceEventNode* root); }; +std::unique_ptr LoadProfilerResult(std::string filename); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 5784d6e671bbb..35dbc96874d3c 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -25,8 +25,10 @@ #endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { @@ -44,10 +46,15 @@ std::unique_ptr Profiler::Create(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options) { options_ = options; - HostTracerOptions host_tracer_options; - host_tracer_options.trace_level = options.trace_level; - tracers_.emplace_back(new HostTracer(host_tracer_options), true); - tracers_.emplace_back(&CudaTracer::GetInstance(), false); + std::bitset<32> trace_switch(options_.trace_switch); + if 
(trace_switch.test(kProfileCPUOptionBit)) { + HostTracerOptions host_tracer_options; + host_tracer_options.trace_level = options_.trace_level; + tracers_.emplace_back(new HostTracer(host_tracer_options), true); + } + if (trace_switch.test(kProfileGPUOptionBit)) { + tracers_.emplace_back(&CudaTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } @@ -63,9 +70,10 @@ void Profiler::Start() { for (auto& tracer : tracers_) { tracer.Get().StartTracing(); } + cpu_utilization_.RecordBeginTimeInfo(); } -std::unique_ptr Profiler::Stop() { +std::unique_ptr Profiler::Stop() { SynchronizeAllDevice(); TraceEventCollector collector; for (auto& tracer : tracers_) { @@ -75,7 +83,22 @@ std::unique_ptr Profiler::Stop() { std::unique_ptr tree(new NodeTrees(collector.HostEvents(), collector.RuntimeEvents(), collector.DeviceEvents())); - return tree; + cpu_utilization_.RecordEndTimeInfo(); + ExtraInfo extrainfo; + extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuUtilization()); + extrainfo.AddExtraInfo(std::string("Process Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuCurProcessUtilization()); + const std::unordered_map thread_names = + collector.ThreadNames(); + for (const auto& kv : thread_names) { + extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), + kv.second); + } + return std::unique_ptr( + new platform::ProfilerResult(std::move(tree), extrainfo)); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index 4fc1c6daf96c7..f9a8ece050492 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -15,12 +15,15 @@ #pragma once #include +#include #include #include #include #include #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/cpu_utilization.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/tracer_base.h" DECLARE_int64(host_trace_level); @@ -28,7 +31,11 @@ DECLARE_int64(host_trace_level); namespace paddle { namespace platform { +static constexpr uint32_t kProfileCPUOptionBit = 0; +static constexpr uint32_t kProfileGPUOptionBit = 1; + struct ProfilerOptions { + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -40,7 +47,7 @@ class Profiler { void Start(); - std::unique_ptr Stop(); + std::unique_ptr Stop(); ~Profiler(); @@ -70,6 +77,7 @@ class Profiler { ProfilerOptions options_; uint64_t start_ns_ = UINT64_MAX; std::list tracers_; + CpuUtilization cpu_utilization_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 160c801dc6e3e..32310b9e86228 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -22,6 +22,7 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" @@ -30,8 +31,10 @@ TEST(ProfilerTest, TestHostTracer) { using paddle::platform::Profiler; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; + options.trace_switch = 3; auto profiler = 
Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -42,7 +45,8 @@ TEST(ProfilerTest, TestHostTracer) { RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, 3); } - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::set host_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto evt : pair.second) { @@ -56,8 +60,10 @@ TEST(ProfilerTest, TestHostTracer) { TEST(ProfilerTest, TestCudaTracer) { using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; + options.trace_switch = 3; auto profiler = Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -72,7 +78,8 @@ TEST(ProfilerTest, TestCudaTracer) { hipStreamCreate(&stream); hipStreamSynchronize(stream); #endif - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::vector runtime_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto host_node : pair.second) { diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 61f96218560ec..16ef62fb51555 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -48,6 +48,8 @@ enum class TracerEventType { Communication = 12, // Used to mark python api PythonOp = 13, + // Used to mark python level userdefined + PythonUserDefined = 14, // A flag to denote the number of current types NumTypes }; From 1db188f318ae0b0292984e08afd626898e3170da Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 2 Mar 2022 15:37:29 +0800 Subject: [PATCH 09/41] [IPU] update ipu unittests p0 (#39707) * update ipu UTs part0 * rename UT * sync api changes * update uts for new api * use_ipumodel() as classmethod --- .../tests/unittests/ipu/ernie_training.py | 934 ------------------ .../fluid/tests/unittests/ipu/op_test_ipu.py | 73 +- .../unittests/ipu/test_activation_x_op_ipu.py | 133 +++ .../unittests/ipu/test_arg_max_op_ipu.py | 117 +++ .../tests/unittests/ipu/test_assign_op_ipu.py | 102 ++ .../tests/unittests/ipu/test_avg_shard_ipu.py | 112 ++- .../unittests/ipu/test_batch_norm_op_ipu.py | 108 +- ....py => test_batchs_per_step_simple_ipu.py} | 22 +- .../tests/unittests/ipu/test_cast_op_ipu.py | 111 ++- .../tests/unittests/ipu/test_concat_op_ipu.py | 93 +- .../tests/unittests/ipu/test_conv_op_ipu.py | 127 +-- .../ipu/test_cross_entropy2_op_ipu.py | 128 ++- .../tests/unittests/ipu/test_cumsum_op_ipu.py | 123 +++ 13 files changed, 950 insertions(+), 1233 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/ernie_training.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_batchs_per_step_simple.py => test_batchs_per_step_simple_ipu.py} (79%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py deleted file mode 100644 index ddda666db2c0c..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py +++ /dev/null 
@@ -1,934 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# refrenece : https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/ernie - -import os -import copy -import argparse -from contextlib import contextmanager -from functools import partial - -import numpy as np -import paddle -import paddle.static -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import paddle.fluid.compiler as compiler -paddle.enable_static() - -SEED = 2021 -INT_DTYPE = None - -# ernie related block -ernie_config = { - "emb_size": 128, - "emb_mapping_in": False, - "hidden_size": 192, - "num_hidden_layers": 2, - "n_layer_per_block": 2, - "num_attention_heads": 12, - "vocab_size": 300, - "max_position_embeddings": 512, - "sent_type_vocab_size": 4, - "task_type_vocab_size": 16, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.0, - "attention_probs_dropout_prob": 0.0, - "preln": False, - "pre_encoder_cmd": "n", - "preprocess_cmd": "", - "postprocess_cmd": "an", - "epsilon": 1e-12, - "initializer_range": 0.02, - "seq_len": 32 -} - - -def gelu(x): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + fluid.layers.tanh( - (np.sqrt(2.0 / np.pi) * (x + 0.044715 * fluid.layers.pow(x, 3.0))))) - return x * cdf - - -def pre_post_process_layer(prev_out, - out, - process_cmd, - dropout_rate=0., - epsilon=1e-12, - name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr( - name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=epsilon) - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def positionwise_feed_forward(x, - d_inner_hid, - d_hid, - dropout_rate, - hidden_act, - param_initializer=None, - name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. 
- """ - - #assert hidden_act == 'gelu.approximate' - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=None, - param_attr=fluid.ParamAttr( - name=name + '_fc_0.w_0', - initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - hidden = gelu(hidden) - - if dropout_rate: - hidden = layers.dropout( - hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - - return out - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. - """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_query_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_key_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_value_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape( - x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=False) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. 
- return layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=False) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat( - [layers.reshape( - cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat( - [layers.reshape( - cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, - dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_output_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - - return proj_out - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name='', - epsilon=1e-12): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. - """ - - attn_output = multi_head_attention( - enc_input, - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - - attn_output = post_process_layer( - enc_input, - attn_output, - 'an', - prepostprocess_dropout, - name=name + '_post_att', - epsilon=epsilon) - - ffd_output = positionwise_feed_forward( - attn_output, - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - - post_output = post_process_layer( - attn_output, - ffd_output, - 'an', - prepostprocess_dropout, - name=name + '_post_ffn', - epsilon=epsilon) - - return post_output - - -def encoder_inner_share(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=None, - name='', - n_layer_per_block=1): - """ - The encoder_inner_share is composed of n_layer_per_block layers returned by calling - encoder_layer. 
- """ - - for i in range(n_layer_per_block): - enc_output = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i), - epsilon=epsilon) - - enc_input = enc_output - - return enc_output - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - n_layer_per_block, - param_initializer=None, - name='', - preln=False): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer . - """ - - for _ in range(n_layer // n_layer_per_block): - attn_bias.stop_gradient = True - attn_bias.persistable = False - enc_output = encoder_inner_share( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=param_initializer, - name=name, - n_layer_per_block=n_layer_per_block) - - enc_input = enc_output - - if preln: - enc_output = post_process_layer( - None, - enc_output, - 'n', - prepostprocess_dropout, - name='post_encoder', - epsilon=epsilon) - - enc_output = pre_process_layer( - enc_output, - preprocess_cmd, - prepostprocess_dropout, - name="post_encoder", - epsilon=epsilon) - - return enc_output - - -class ErnieModel(object): - def __init__(self, src_ids, sent_ids, pos_ids, input_mask, config): - - self._emb_size = config['emb_size'] if config[ - 'emb_mapping_in'] else config['hidden_size'] - self._hidden_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['sent_type_vocab_size'] - self._task_types = config['task_type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self.config = config - self.preln = config['preln'] if 'preln' in config.keys() else False - self.pre_encoder_cmd = "" if self.preln else self.config[ - 'pre_encoder_cmd'] - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._task_emb_name = "task_embedding" - self._dtype = "float32" - self._emb_dtype = "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range']) - - self.src_ids = src_ids - self.sent_ids = sent_ids - self.pos_ids = pos_ids - self.input_mask = input_mask - ''' - _build_position_ids: range op doesn't support - _build_input_mask: logic_not op doesn't support - ''' - - self._build_model() - - def _build_model(self, emb=None): - with fluid.ipu_shard(ipu_index=0, ipu_stage=0): - # padding id in vocabulary must be set to 0 - self.emb_out = fluid.layers.embedding( - input=self.src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - - self.position_emb_out = fluid.layers.embedding( - input=self.pos_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._pos_emb_name, - initializer=self._param_initializer)) - - self.sent_emb_out = fluid.layers.embedding( - self.sent_ids, - size=[self._sent_types, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._sent_emb_name, - initializer=self._param_initializer)) - - sum_emb = self.emb_out + self.position_emb_out + self.sent_emb_out - - sum_emb = pre_process_layer( - sum_emb, - self.config['pre_encoder_cmd'], - self._prepostprocess_dropout, - name='pre_encoder', - epsilon=self.config['epsilon']) - - if self.config['emb_mapping_in']: - sum_emb = fluid.layers.fc( - input=sum_emb, - num_flatten_dims=2, - size=self._hidden_size, - param_attr=fluid.ParamAttr( - name='emb_hidden_mapping', - initializer=self._param_initializer), - bias_attr='emb_hidden_mapping_bias') - - self_attn_mask = fluid.layers.matmul( - x=self.input_mask, y=self.input_mask, transpose_y=True) - - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, - scale=10000.0, - bias=-1.0, - bias_after_scale=False) - - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, - axis=1) # [bs, _n_head, seqlen, seq_len] - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder( - enc_input=sum_emb, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._hidden_size // self._n_head, - d_value=self._hidden_size // self._n_head, - d_model=self._hidden_size, - d_inner_hid=self._hidden_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd=self.config['preprocess_cmd'], - postprocess_cmd=self.config['postprocess_cmd'], - param_initializer=self._param_initializer, - name='encoder', - epsilon=self.config['epsilon'], - n_layer_per_block=self.config['n_layer_per_block'], - preln=self.preln) - - def _build_position_ids(self): - d_shape = fluid.layers.shape(self.src_ids) - d_seqlen = d_shape[1] - d_batch = d_shape[0] - position_ids = fluid.layers.reshape( - fluid.layers.range( - 0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1], - inplace=False) - position_ids = fluid.layers.expand(position_ids, [d_batch, 1, 1]) - position_ids = fluid.layers.cast(position_ids, INT_DTYPE) - position_ids.stop_gradient = True - return position_ids - - def _build_input_mask(self): - zero = fluid.layers.fill_constant([1], dtype=INT_DTYPE, value=0) - input_mask = fluid.layers.logical_not( - fluid.layers.equal(self.src_ids, zero)) # assume pad id == 0 - input_mask = fluid.layers.cast(input_mask, 
'float32') - input_mask.stop_gradient = True - return input_mask - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - next_sent_feat = fluid.layers.slice( - input=self._enc_out, axes=[1], starts=[0], ends=[1]) - - next_sent_feat = fluid.layers.fc( - input=next_sent_feat, - size=self._hidden_size, - act="tanh", - param_attr=fluid.ParamAttr( - name="pooled_fc.w_0", initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_next_sentence_output(self, labels): - next_sent_feat = self.get_pooled_output() - next_sent_fc_out = fluid.layers.fc( - input=next_sent_feat, - num_flatten_dims=1, - size=33, - param_attr=fluid.ParamAttr( - name="next_sent_fc.w_0", initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - next_sent_fc_out = fluid.layers.reshape( - next_sent_fc_out, [-1, 33], inplace=False) - #next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=next_sent_fc_out, label=labels, return_softmax=True) - next_sent_softmax = fluid.layers.softmax(next_sent_fc_out) - next_sent_loss = fluid.layers.cross_entropy(next_sent_softmax, labels) - next_sent_acc = fluid.layers.accuracy( - input=next_sent_softmax, label=labels) - mean_next_sent_loss = fluid.layers.mean(next_sent_loss, - "mean_next_sent_loss") - return next_sent_acc, mean_next_sent_loss - - def get_lm_output(self, mask_label, mask_pos): - """Get the loss & accuracy for pretraining""" - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - reshaped_emb_out = fluid.layers.reshape( - x=self._enc_out, shape=[-1, self._hidden_size]) - - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - if self._dtype == "float16": - mask_feat = fluid.layers.cast(x=mask_feat, dtype=self._emb_dtype) - - # transform: fc - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - _hidden_act = 'gelu' - else: - _hidden_act = None - - mask_trans_feat = fluid.layers.fc( - input=mask_feat, - size=self._emb_size, - act=_hidden_act, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - pass - else: - mask_trans_feat = gelu(mask_trans_feat) - - # transform: layer norm - mask_trans_feat = fluid.layers.layer_norm( - mask_trans_feat, - begin_norm_axis=len(mask_trans_feat.shape) - 1, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=self.config['epsilon']) - - mask_lm_out_bias_attr = fluid.ParamAttr( - name="mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0)) - - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr( - name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - #mask_lm_loss = fluid.layers.softmax_with_cross_entropy( - # logits=fc_out, label=mask_label) - mask_lm_softmax = fluid.layers.softmax(fc_out) - mask_lm_loss = fluid.layers.cross_entropy(mask_lm_softmax, mask_label) - mean_mask_lm_loss = fluid.layers.mean( - mask_lm_loss, name="mean_mask_lm_loss") - - 
return mask_lm_loss, mean_mask_lm_loss - - def get_task_output(self, task, task_labels): - task_fc_out = fluid.layers.fc(input=self.next_sent_feat, - size=task["num_labels"], - param_attr=fluid.ParamAttr( - name=task["task_name"] + "_fc.w_0", - initializer=self._param_initializer), - bias_attr=task["task_name"] + "_fc.b_0") - #task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=task_fc_out, label=task_labels, return_softmax=True) - task_softmax = fluid.layers.softmax(task_fc_out) - task_loss = fluid.layers.cross_entropy(task_softmax, task_labels) - task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels) - mean_task_loss = fluid.layers.mean(task_loss) - return mean_task_loss, task_acc - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(__doc__) - parser.add_argument( - "--run_on_ipu", type=bool, default=True, help="Run model with IPU") - parser.add_argument( - "--is_training", type=bool, default=True, help="Train of inference") - parser.add_argument( - "--num_ipus", type=int, default=2, help="Number of ipus") - parser.add_argument( - "--enable_pipelining", type=bool, default=False, help="Pipelining") - parser.add_argument( - "--save_model", type=bool, default=False, help="Save model or not") - parser.add_argument( - "--model_path", type=str, default="ernie", help="Save model to where") - parser.add_argument( - "--model_name", type=str, default="ernie", help="Save model name") - parser.add_argument( - "--ipu_run_steps", type=int, default=10, help="Number steps exe.run()") - parser.add_argument( - "--export_ops", type=bool, default=False, help="Export ops to ops.txt") - parser.add_argument( - "--export_ipu_idx", type=bool, default=False, help="Export op-idx pair") - args = parser.parse_args() - - # set random seed - np.random.seed(SEED) - paddle.static.default_startup_program().random_seed = SEED - paddle.static.default_main_program().random_seed = SEED - - # IPU doesn't support int64, so we change here - INT_DTYPE = "int32" if args.run_on_ipu else "int64" - - # paddle input placeholder, batch_size = 1 - micro_bs = 1 - seq_len = ernie_config["seq_len"] - input_shape = [micro_bs, seq_len, 1] - input_fields = { - 'names': [ - 'src_ids', 'sent_ids', 'pos_ids', 'input_mask', 'mask_label', - 'mask_pos' - ], - 'shapes': [ - input_shape, input_shape, input_shape, input_shape, [micro_bs, 1], - [micro_bs, 1] - ], - 'dtypes': - [INT_DTYPE, INT_DTYPE, INT_DTYPE, 'float32', INT_DTYPE, INT_DTYPE], - 'range': [[0, seq_len], [0, 4], [0, seq_len], None, [0, seq_len], - [0, seq_len]], - 'lod_levels': [0, 0, 0, 0, 0, 0], - } - - inputs = [ - fluid.data( - name=input_fields['names'][i], - shape=input_fields['shapes'][i], - dtype=input_fields['dtypes'][i], - lod_level=input_fields['lod_levels'][i]) - for i in range(len(input_fields['names'])) - ] - - # total_samples: assum disable pipelining - batches_per_step = 1 - if args.enable_pipelining: - batches_per_step = \ - ((args.num_ipus+1) if args.is_training else args.num_ipus) - total_samples = args.ipu_run_steps * batches_per_step - - total_steps = args.ipu_run_steps - if not args.run_on_ipu: # run on cpu - total_steps = total_samples // micro_bs - - # synthetic data - np_inputs = [] - for i in range(len(input_fields['names'])): - field_name = input_fields['names'][i] - if field_name == 'input_mask': - src_ids = np_inputs[0] - dtype = input_fields['dtypes'][i] - data = np.where(src_ids > 0, - np.ones_like(src_ids), - np.zeros_like(src_ids)).astype(dtype) - else: - shape = 
copy.copy(input_fields['shapes'][i]) - shape[0] = total_samples - min_val, max_val = input_fields['range'][i] - data = np.random.randint( - min_val, max_val, shape, dtype=input_fields['dtypes'][i]) - np_inputs.append(data) - - # paddle input placeholder - (src_ids, sent_ids, pos_ids, input_mask, mask_label, mask_pos) = inputs - - # ernie model - ernie = ErnieModel(src_ids, sent_ids, pos_ids, input_mask, ernie_config) - fetch_node = ernie.get_sequence_output() - if args.is_training: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - _, mean_mask_lm_loss = ernie.get_lm_output(mask_label, mask_pos) - fetch_node = mean_mask_lm_loss - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(mean_mask_lm_loss) - - # place = paddle.CPUPlace() - if args.run_on_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - executor = paddle.static.Executor(place) - - # feed & fetch list - if args.is_training: - feed_list = input_fields['names'] - else: - feed_list = input_fields['names'][:4] - fetch_list = [fetch_node.name] - - # program - startup_prog = paddle.static.default_startup_program() - executor.run(startup_prog) - - main_prog = paddle.static.default_main_program() - paddle.static.save(main_prog, "model/ernie") - paddle.static.load(main_prog, "model/ernie") - - if args.run_on_ipu: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=args.num_ipus, - is_training=args.is_training, - enable_manual_shard=args.num_ipus > 1) - ipu_strategy.SetPipeliningConfig( - enable_pipelining=args.enable_pipelining, - batches_per_step=args.num_ipus + 1) - - ipu_compiler = compiler.IPUCompiledProgram( - main_prog, ipu_strategy=ipu_strategy) - program = ipu_compiler.compile(feed_list, fetch_list) - else: - program = main_prog - - # executor run - results = [] - for i in range(total_steps): - start = i * (batches_per_step if args.run_on_ipu else 1) - end = start + (batches_per_step if args.run_on_ipu else 1) - feed_dict = { - src_ids.name: np_inputs[0][start:end], - sent_ids.name: np_inputs[1][start:end], - pos_ids.name: np_inputs[2][start:end], - input_mask.name: np_inputs[3][start:end] - } - if args.is_training: - feed_dict[mask_label.name] = np_inputs[4][start:end] - feed_dict[mask_pos.name] = np_inputs[5][start:end] - - res = executor.run(program, feed=feed_dict, fetch_list=[fetch_node]) - results.append(res) - - paddle.static.save(main_prog, "model/ernie") - - results = np.asarray(results).flatten() - if results.size > 32: - results = results[-32:] - print(results) - - if args.save_model: - full_name = args.model_path + '/' + args.model_name - if args.is_training: - fluid.save(program=main_prog, model_path=full_name) - else: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - paddle.static.save_inference_model( - full_name, [src_ids, sent_ids, pos_ids, input_mask], - [fetch_node], executor) - - if args.export_ops: - op_type_list = [] - for op in main_prog.global_block().ops: - op_type_list.append(op.desc.type()) - - with open("ops.txt", "w") as fp: - for op_type in set(op_type_list): - fp.write(op_type + os.linesep) - - if args.export_ipu_idx: - op_ipu_idx_list = [] - for op in main_prog.global_block().ops: - if op._is_backward_op(): - continue - - op_ipu_idx_pair = [op.desc.type()] - if op.desc.has_attr("ipu_index"): - op_ipu_idx_pair.append(op.desc.attr("ipu_index")) - else: - op_ipu_idx_pair.append(-1) # not assign ipu_index - op_ipu_idx_list.append(op_ipu_idx_pair) - op_ipu_idx_list.sort(key=lambda item: item[-1]) - - with open("ops_ipu_idx.txt", "w") 
as fp: - for op_ipu_idx_pair in op_ipu_idx_list: - fp.write(str(op_ipu_idx_pair) + os.linesep) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 0d09f60406001..790388f30ead9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random import unittest - import numpy as np -from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator -from typing import Optional -import paddle.fluid.compiler as compiler - -SEED = 2021 +from enum import Enum -ipu_compiler_ref: Optional[compiler.IPUCompiledProgram] = None +import paddle +import paddle.static map_np_dtype_to_fluid_dtype = { 'bool': "bool", @@ -36,6 +33,19 @@ } +class ExecutionMode(Enum): + CPU_FP32 = 1 + IPU_FP32 = 2 + # enable_fp16 through ipu_strategy.enable_fp16 + IPU_POPART_FP16 = 3 + + def __lt__(self, other): + return self.value < other.value + + def __gt__(self, other): + return self.value > other.value + + def np_dtype_to_fluid_str(dtype: np.dtype) -> str: return map_np_dtype_to_fluid_dtype[dtype.name] @@ -43,14 +53,16 @@ def np_dtype_to_fluid_str(dtype: np.dtype) -> str: class IPUOpTest(unittest.TestCase): @classmethod def setUpClass(cls): + # Get random seeds cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - cls.SEED = SEED + cls.SEED = 2021 np.random.seed(cls.SEED) random.seed(cls.SEED) - cls._use_system_allocator = _set_use_system_allocator(True) + # Enable paddle static graph mode + paddle.enable_static() @classmethod def tearDownClass(cls): @@ -58,14 +70,47 @@ def tearDownClass(cls): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) - _set_use_system_allocator(cls._use_system_allocator) - # unittest will to trigger IPUCompiledProgram.__del__ automatically - global ipu_compiler_ref - ipu_compiler_ref is not None and ipu_compiler_ref.clean() + @classmethod + def use_ipumodel(cls): + if 'POPLAR_IPUMODEL' not in os.environ: + return False + else: + flag = os.environ['POPLAR_IPUMODEL'] + if flag.upper() in ['1', "TRUE"]: + return True def set_atol(self): - self.atol = 1e-5 + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 def set_training(self): self.is_training = False self.epoch = 1 + + def check(self, outputs, check_shape=False): + cpu_fp32 = outputs[ExecutionMode.CPU_FP32] + ipu_fp32 = outputs[ExecutionMode.IPU_FP32] + max_diff = np.abs(cpu_fp32 - ipu_fp32).max() + fp32_flag = np.allclose( + cpu_fp32, ipu_fp32, rtol=self.rtol, atol=self.atol) + self.assertTrue(fp32_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) + + ipu_popart_fp16 = None + if ExecutionMode.IPU_POPART_FP16 in outputs.keys(): + ipu_popart_fp16 = outputs[ExecutionMode.IPU_POPART_FP16] + max_diff = np.abs(ipu_popart_fp16.astype(np.float32) - + cpu_fp32).max() + fp16_flag = np.allclose( + ipu_popart_fp16.astype(np.float32), + cpu_fp32, + rtol=self.rtol_fp16, + atol=self.atol_fp16) + self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(ipu_popart_fp16.shape == cpu_fp32.shape) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py new file mode 100644 index 
0000000000000..138365b650f24 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.nn.functional as F +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestRelu(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_test_op(self): + self.op = paddle.fluid.layers.relu + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = self.op(x, **self.op_attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestTanh(TestRelu): + def set_test_op(self): + self.op = F.tanh + self.op_attrs = {} + + +class TestLog(TestRelu): + def set_test_op(self): + self.op = paddle.fluid.layers.log + self.op_attrs = {} + + +class TestSigmoid(TestRelu): + def set_test_op(self): + self.op = F.sigmoid + self.op_attrs = {} + + +class TestSqrt(TestRelu): + def set_test_op(self): + self.op = paddle.fluid.layers.sqrt 
+ self.op_attrs = {} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py new file mode 100644 index 0000000000000..d14eba98ef5d7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[10, 1000]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {"axis": -1} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.fluid.layers.argmax(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0].astype(np.int32) + + def test_base(self): + output_dict_fp32 = {} + output_dict_fp16 = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + if mode > ExecutionMode.IPU_FP32: + output_dict_fp16[mode] = self._test_base(mode).flatten() + else: + output_dict_fp32[mode] = self._test_base(mode).flatten() + + 
self.check(output_dict_fp32) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"axis": 0} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py new file mode 100644 index 0000000000000..4f17c90de72ad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + assign = paddle.assign(x) + out = paddle.fluid.layers.elementwise_add(assign, assign) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index a23cacf47636b..f34e5b0d8b9dc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -16,13 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,78 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 128, 128]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): - self.attrs = {} - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 2e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 128, 128]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - conv1 = paddle.static.nn.conv2d( + + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - conv1, num_filters=3, filter_size=3, bias_attr=False) - conv3 = paddle.static.nn.conv2d( - conv2, num_filters=3, filter_size=3, bias_attr=False) - conv4 = paddle.static.nn.conv2d( - conv3, num_filters=3, filter_size=3, bias_attr=False) - fetch_list = [conv4.name] + fetch_list = [x.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=self.is_training, - enable_manual_shard=True, - need_avg_shard=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({'need_avg_shard': True}) + if exec_mode == 
ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 87f783dbd1c1a..1dab958c1ecbc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -16,13 +16,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,76 +27,100 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = False self.attrs['data_layout'] = 'NCHW' self.attrs['in_place'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) out = paddle.fluid.layers.batch_norm(conv1, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == 
ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' @@ -108,7 +128,13 @@ def set_attrs(self): class TestCase2(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py similarity index 79% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py rename to python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py index 9b485d7794db2..ef61e651b2ad9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py @@ -17,8 +17,7 @@ import numpy as np import unittest import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static paddle.enable_static() SEED = 2021 @@ -28,7 +27,7 @@ "core is not compiled with IPU") class TestFunc(unittest.TestCase): def _test_func(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = SEED @@ -40,22 +39,20 @@ def _test_func(self, run_ipu=True): c, h, w = 3, 10, 10 np_image = np.random.uniform(size=[1 * bps, c, h, w]).astype(np.float32) - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name='image', shape=[n, c, h, w], dtype='float32') conv2d = paddle.static.nn.conv2d( image, num_filters=3, filter_size=3, bias_attr=False) - # paddle.mean oshape on ipu is [bps], need another mean() - # paddle.mean oshape on cpu is [1] - # out = paddle.mean(conv2d) out = conv2d if run_ipu: place = paddle.IPUPlace() else: 
place = paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) @@ -63,14 +60,9 @@ def _test_func(self, run_ipu=True): feed_list = [image.name] fetch_list = [out.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=False, - enable_manual_shard=True, - need_avg_shard=True) - ipu_strategy.SetPipeliningConfig( - enable_pipelinin=True, batches_per_step=bps) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=False) + ipu_strategy.set_pipelining_config(batches_per_step=bps) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 6e58f80904600..5f0eeaa2f99ab 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,14 +26,14 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() def set_atol(self): self.atol = 1e-3 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), } @@ -47,23 +41,20 @@ def set_feed(self): def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -82,8 +73,8 @@ def _test_base(self, run_ipu=True): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -103,27 +94,91 @@ def test_base(self): self.assertTrue(res0.shape == res1.shape) -class TestCase1(TestBase): - def set_attrs(self): +class TestCase2(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'float32' + + +class TestCase3(TestBase): 
+ def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase4(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'float32' + + +class TestCase5(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase6(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' @unittest.skip('float64 is not supported') class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float64' @unittest.skip('skip float16 to float32') class TestCase3(TestBase): - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' @@ -133,13 +188,13 @@ class TestCase4(TestBase): def set_atol(self): self.atol = 1 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.randint( low=1, high=100, size=[1, 3, 3, 3]).astype('int32'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'int8' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index 094b19ce99da9..c5a8090283940 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -16,14 +16,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,81 +27,95 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 10, 10]) + data2 = np.random.uniform(size=[1, 3, 10, 10]) - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + 
self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.concat([x, y], **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 1} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index f28733de6b1a1..ade54fda86929 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -16,13 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,20 +26,30 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + 
self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['num_filters'] = 3 self.attrs['filter_size'] = 3 @@ -54,104 +59,112 @@ def set_attrs(self): self.attrs['groups'] = 1 self.attrs['data_format'] = 'NCHW' - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.conv2d(image, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['num_filters'] = 1 class TestCase2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 3] class TestCase2_1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 2] class TestCase3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['stride'] = [2, 3] class TestCase4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() 
self.attrs['dilation'] = [2, 2] class TestCase5(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['groups'] = 3 class TestCase6(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = 2 class TestCase7(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [2, 3] class TestCase8(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [1, 2, 2, 3] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 3987c6cd5b386..3a21f0cb0079c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,44 +26,54 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 7]).astype('float32'), - "label": np.arange(3).reshape([3]).astype(np.int64), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[3, 7]) + label = np.arange(3).reshape([3, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {'soft_label': False, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def np_nll_loss(self): + tmp = -np.log(self.feed_fp32['x']) + label = self.feed_fp32['label'] + indice = [range(label.shape[0]), label.flatten()] + self.np_ref = tmp[indice] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype="float32") - # [warning] Copying (host) tensor input/1 from INT64 to INT32. 
- # Will only warn once - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: label = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], @@ -80,52 +84,78 @@ def _test_base(self, run_ipu=True): shape=self.feed_shape[1], dtype='int64') - out = fluid.layers.cross_entropy( + out = paddle.fluid.layers.cross_entropy( input=x, label=label, **self.attrs) + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + if exec_mode != ExecutionMode.CPU_FP32: + feed['label'] = feed['label'].astype(np.int32) - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue(res0.shape == res1.shape) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + self.np_nll_loss() + + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'soft_label': False, 'ignore_index': 1, } -@unittest.skip("soft_label=True id not supported") class TestCase2(TestBase): - def set_attrs(self): + def set_data_feed(self): + x = np.random.uniform(size=[30, 70]) + label = np.arange(30).reshape([30, 1]) + + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +@unittest.skip("soft_label=True is not supported") +class TestCase3(TestBase): + def set_op_attrs(self): self.attrs = {'soft_label': True, } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py new file mode 100644 index 0000000000000..2f1d86daf0057 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + # popart unsupport fp16 cumsum + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + + out = paddle.fluid.layers.cumsum(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": False} + + +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": False, "reverse": True} + + +class TestCase3(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": True} + + +if __name__ == "__main__": + unittest.main() From 6af2729e615a8d6b3b4f96964f1c71d20b8f5517 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:45:28 +0800 Subject: [PATCH 10/41] =?UTF-8?q?=E3=80=90phi=E3=80=91migrate=20gather=5Ft?= =?UTF-8?q?ree,reduce=5Fprod=20to=20phi=20(#39844)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move to phi * migrate gather_tree_op into phi * move reduce_prod tp phi * optimize code --- paddle/fluid/operators/gather_tree_op.cc | 4 +- paddle/fluid/operators/gather_tree_op.cu | 84 ------------------- paddle/fluid/operators/gather_tree_op.h | 66 --------------- .../operators/reduce_ops/reduce_prod_op.cc | 10 +-- .../operators/reduce_ops/reduce_prod_op.h | 7 -- 
paddle/phi/kernels/cpu/gather_tree_kernel.cc | 62 ++++++++++++++ paddle/phi/kernels/cpu/reduce_prod_kernel.cc | 44 ++++++++++ paddle/phi/kernels/funcs/reduce_functor.h | 8 ++ .../kernels/gather_tree_kernel.h} | 21 +++-- paddle/phi/kernels/gpu/gather_tree_kernel.cu | 79 +++++++++++++++++ paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 43 ++++++++++ paddle/phi/kernels/reduce_prod_kernel.h | 29 +++++++ paddle/phi/ops/compat/reduce_sig.cc | 6 ++ 13 files changed, 285 insertions(+), 178 deletions(-) delete mode 100644 paddle/fluid/operators/gather_tree_op.cu delete mode 100644 paddle/fluid/operators/gather_tree_op.h create mode 100644 paddle/phi/kernels/cpu/gather_tree_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_prod_kernel.cc rename paddle/{fluid/operators/reduce_ops/reduce_prod_op.cu => phi/kernels/gather_tree_kernel.h} (51%) create mode 100644 paddle/phi/kernels/gpu/gather_tree_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_prod_kernel.cu create mode 100644 paddle/phi/kernels/reduce_prod_kernel.h diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 830134e57e0e7..2868c3697eda1 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_tree_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -73,5 +73,3 @@ selected ids. namespace ops = paddle::operators; REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); -REGISTER_OP_CPU_KERNEL(gather_tree, ops::GatherTreeOpKernel, - ops::GatherTreeOpKernel); diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu deleted file mode 100644 index 829682764a674..0000000000000 --- a/paddle/fluid/operators/gather_tree_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_tree_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void GatherTree(const T *ids_data, const T *parents_data, - T *out_data, const int64_t max_length, - const int64_t batch_size, const int64_t beam_size) { - CUDA_KERNEL_LOOP(i, batch_size * beam_size) { - int batch = i / beam_size; - int beam = i % beam_size; - auto idx = - (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } -} - -template -class GatherTreeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - auto &ids_dims = ids->dims(); - int64_t max_length = ids_dims[0]; - int64_t batch_size = ids_dims[1]; - int64_t beam_size = ids_dims[2]; - - auto &dev_ctx = ctx.cuda_device_context(); - - const int block = 512; - int max_threads = - std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), - batch_size * beam_size); - const int grid = std::max(max_threads / block, 1); - GatherTree<<>>(ids_data, parents_data, out_data, max_length, - batch_size, beam_size); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(gather_tree, ops::GatherTreeOpCUDAKernel, - ops::GatherTreeOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_tree_op.h b/paddle/fluid/operators/gather_tree_op.h deleted file mode 100644 index e035a30e7954f..0000000000000 --- a/paddle/fluid/operators/gather_tree_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherTreeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - auto &ids_dims = ids->dims(); - auto max_length = ids_dims[0]; - auto batch_size = ids_dims[1]; - auto beam_size = ids_dims[2]; - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - for (int batch = 0; batch < batch_size; batch++) { - for (int beam = 0; beam < beam_size; beam++) { - auto idx = (max_length - 1) * batch_size * beam_size + - batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 50df75d9ad3fd..eb745ab9c56c5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -27,15 +27,7 @@ class CPUDeviceContext; } // namespace paddle REGISTER_REDUCE_OP(reduce_prod); -REGISTER_OP_CPU_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); + REGISTER_OP_CPU_KERNEL(reduce_prod_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h index 103e108e4bda1..60dedf8d6ffb0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h @@ -19,13 +19,6 @@ namespace paddle { namespace operators { -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - struct ProdGradFunctor { template diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc new file mode 100644 index 0000000000000..25fb870d851f6 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
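The deleted fluid kernels above and the new phi kernels that follow implement the same beam-search backtracking: starting from the last time step, each beam walks its parent pointers backwards and re-reads the ids along that path. A minimal NumPy transcription of that loop, using the same [max_length, batch_size, beam_size] layout as the C++ code (the function name gather_tree_ref is illustrative, not an API from this patch):

import numpy as np

def gather_tree_ref(ids, parents):
    # ids / parents: integer arrays of shape [max_length, batch_size, beam_size]
    ids = np.asarray(ids)
    parents = np.asarray(parents)
    max_length, batch_size, beam_size = ids.shape
    out = np.empty_like(ids)
    for batch in range(batch_size):
        for beam in range(beam_size):
            # the last step keeps its own id; earlier steps follow the parent chain
            out[max_length - 1, batch, beam] = ids[max_length - 1, batch, beam]
            parent = parents[max_length - 1, batch, beam]
            for step in range(max_length - 2, -1, -1):
                out[step, batch, beam] = ids[step, batch, parent]
                parent = parents[step, batch, parent]
    return out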
+ +#include "paddle/phi/kernels/gather_tree_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + + T *out_data = dev_ctx.template Alloc(out); + + auto &ids_dims = ids.dims(); + auto max_length = ids_dims[0]; + auto batch_size = ids_dims[1]; + auto beam_size = ids_dims[2]; + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + for (int batch = 0; batch < batch_size; batch++) { + for (int beam = 0; beam < beam_size; beam++) { + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, CPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc new file mode 100644 index 0000000000000..cf0179124ebdf --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
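The CPU kernel that follows reduces the input by multiplication over the requested dims, mirroring the ProdFunctor (y = x.prod(dim)) that this patch moves into paddle/phi/kernels/funcs/reduce_functor.h. A NumPy sketch of that semantics under the usual keep_dim / reduce_all convention (ref_reduce_prod is an illustrative name, not part of the patch):

import numpy as np

def ref_reduce_prod(x, dims, keep_dim=False, reduce_all=False):
    axes = None if reduce_all else tuple(dims)
    return np.prod(np.asarray(x), axis=axes, keepdims=keep_dim)

# ref_reduce_prod([[1., 2.], [3., 4.]], dims=[1])  -> [2., 12.]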
+ +#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + CPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index ce8e095e8ac6c..aebd155ac59cb 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -33,5 +33,13 @@ struct MeanFunctor { } }; +//////// Prod Functor /////// +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/phi/kernels/gather_tree_kernel.h similarity index 51% rename from paddle/fluid/operators/reduce_ops/reduce_prod_op.cu rename to paddle/phi/kernels/gather_tree_kernel.h index 2de647df8b182..e5a1a684daef0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/phi/kernels/gather_tree_kernel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#pragma once -REGISTER_OP_CUDA_KERNEL( - reduce_prod, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu new file mode 100644 index 0000000000000..a9e73ec37c8ed --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gather_tree_kernel.h" + +namespace phi { + +template +__global__ void GatherTree(const T *ids_data, + const T *parents_data, + T *out_data, + const int64_t max_length, + const int64_t batch_size, + const int64_t beam_size) { + CUDA_KERNEL_LOOP(i, batch_size * beam_size) { + int batch = i / beam_size; + int beam = i % beam_size; + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } +} + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + T *out_data = dev_ctx.template Alloc(out); + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + auto &ids_dims = ids.dims(); + int64_t max_length = ids_dims[0]; + int64_t batch_size = ids_dims[1]; + int64_t beam_size = ids_dims[2]; + + const int block = 512; + int max_threads = + std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), + batch_size * beam_size); + const int grid = std::max(max_threads / block, 1); + GatherTree<<>>( + ids_data, parents_data, out_data, max_length, batch_size, beam_size); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, GPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu new file mode 100644 index 0000000000000..14084d0f4f3c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/reduce_prod_kernel.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + GPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/reduce_prod_kernel.h b/paddle/phi/kernels/reduce_prod_kernel.h new file mode 100644 index 0000000000000..5e92b6c4db14e --- /dev/null +++ b/paddle/phi/kernels/reduce_prod_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 6395486ed2b72..92839fb303075 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -51,6 +51,11 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); @@ -58,3 +63,4 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); From c9cd47d96b2cccb34d8dc269a055f5b64346a10e Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Wed, 2 Mar 2022 15:58:57 +0800 Subject: [PATCH 11/41] [Auto Parallel] Adapt Partitioner & DistOp for ERNIE3.0 Inference and cache (#39895) * adapot dist op * add dist_fill_constant_batch_size_like * remvoe print * update compitable * add unitest --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/dist_eltwise.py | 0 .../auto_parallel/operators/dist_embedding.py | 5 +- .../dist_fill_constant_batch_size_like.py | 127 ++++++++++++++++++ .../auto_parallel/operators/dist_matmul.py | 8 +- .../distributed/auto_parallel/partitioner.py | 3 + .../test_auto_parallel_while_op.py | 28 ++++ 7 files changed, 168 insertions(+), 4 deletions(-) mode change 100755 => 100644 python/paddle/distributed/auto_parallel/operators/dist_eltwise.py create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py diff --git 
a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 9f84df2d89634..db6f909f8ca7d 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -27,3 +27,4 @@ from . import dist_check_finite_and_unscale from . import dist_update_loss_scaling from . import dist_split +from . import dist_fill_constant_batch_size_like diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py old mode 100755 new mode 100644 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 94eb0d2d469f0..32f8e2acef5e1 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -155,7 +155,7 @@ def forward(ctx, *args, **kwargs): kwargs['Out']) Ids_var = main_block.var(kwargs['Ids'][0]) - Weight_var = main_block.var(kwargs['W'][0]) + Weight_var = main_block._var_recursive(kwargs['W'][0]) Out_var = main_block.var(kwargs['Out'][0]) # got dist attribute info @@ -277,7 +277,8 @@ def forward(ctx, *args, **kwargs): # param initialization sync if Weight_var.is_parameter and not op_dist_attr.is_recompute: - assert Weight_var.name not in dist_op_context.already_init_sync_vars + if Weight_var.name in dist_op_context.already_init_sync_vars: + return dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) param_dist_attr = ctx.get_tensor_dist_attr_for_program(param) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py new file mode 100644 index 0000000000000..0c9d9eda02e1b --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
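The forward pass of the new distributed op below rewrites the fill_constant_batch_size_like shape attribute so each process only materializes its shard: every dimension whose dims_mapping entry is non-negative is divided by the process-mesh extent along that mapped axis. A quick arithmetic sketch (illustrative only; shard_shape is not an API from this patch, and the 2-way mesh along the mapped axis is an assumption that matches the expected shape checked by the unit test later in this patch):

def shard_shape(shape, dims_mapping, mesh_topology):
    out = list(shape)
    for i, axis in enumerate(dims_mapping):
        if axis >= 0:
            out[i] = out[i] // mesh_topology[axis]
    return out

# shard_shape([-1, 16, 0, 48], [-1, 0, -1, -1], [2])  ->  [-1, 8, 0, 48]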
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id +from paddle.fluid import core, unique_name +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from .dist_default import DistributedDefaultImpl0 + + +class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFillConstantBatchSizeLike, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFillConstantBatchSizeLike("fill_constant_batch_size_like")) + + +class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFillConstantBatchSizeLikeImpl0, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + shape_list = op_desc.attr("shape") + + if len(shape_list) != len(out_dims_mapping): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + in_name = op_desc.input('Input')[0] + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + + # the dim_mapping of batch dimension should be the same + return out_dims_mapping[0] == in_dims_mapping[0] + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + # only the batch size dimemsion of input and output are relative. 
+ dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [0, 0]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + dist_op_context = ctx.dist_op_context + src_op = dist_op_context.cur_src_op + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + main_block = dist_op_context.work_block + op = main_block.ops[-1] + assert op.type == "fill_constant_batch_size_like" + + # modify shape attr according to how output are partitioned + out_name = op.output('Out')[0] + dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + process_mesh_shape = op_dist_attr.process_mesh.topology + shape_list = op.attr("shape") + # modify target shape + for idx, axis in enumerate(dims_mapping): + if axis >= 0: + shape_list[idx] = shape_list[idx] // process_mesh_shape[axis] + + op._set_attr("shape", shape_list) + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl( + "fill_constant_batch_size_like", + DistributedFillConstantBatchSizeLikeImpl0("fill_by_shape")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 9eb24a65e608c..058ae1d0a9fd5 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -433,8 +433,8 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id): - assert Weight_var.name not in dist_op_context.already_init_sync_vars, "{} is in {}.".format( - Weight_var.name, dist_op_context.already_init_sync_vars) + if Weight_var.name in dist_op_context.already_init_sync_vars: + return assert startup_block.has_var(Weight_var.name) dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) @@ -819,6 +819,8 @@ def forward(ctx, *args, **kwargs): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, @@ -1323,6 +1325,8 @@ def forward(ctx, *args, **kwargs): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 2f88407c093a5..ed5ec85d84f22 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -285,6 +285,9 @@ def _get_dist_shape(var, dist_attr): var_shape = var.shape mapping = dist_attr.dims_mapping mesh = dist_attr.process_mesh.topology + if mapping == []: + return var_shape + assert len(var_shape) == len( mapping ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format( diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py index 1cd8f8f3e7083..07e6a2c4346da 100644 --- 
a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py @@ -174,6 +174,7 @@ def get_program(): dtype='float32') label = static.data( name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] # dataloader dataloader = paddle.io.DataLoader.from_generator( @@ -194,6 +195,17 @@ def get_program(): "dims_mapping": [-1, -1, -1] }) + # fill constant bsz like + tmp = paddle.fluid.layers.fill_constant_batch_size_like( + input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0) + auto.shard_tensor( + tmp, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, 0, -1, -1] + }) + + # model mlp_start = MLPLayer( hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -395,6 +407,9 @@ def completion(train_program, start_program, dist_context): op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_idx = 1 + elif op.type == "fill_constant_batch_size_like": + op_dist_attr.impl_type = "fill_constant_batch_size_like" + op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_type = "default" op_dist_attr.impl_idx = 0 @@ -428,6 +443,12 @@ def test_partitioner(self): dist_main_prog, dist_startup_prog = partition( train_program, start_program, dist_context) global_block_ops = dist_main_prog.blocks[0].ops + + fill_op = None + for op in global_block_ops: + if op.type == "fill_constant_batch_size_like": + fill_op = op + global_block_ops = [op.type for op in global_block_ops] sub_block_ops = dist_main_prog.blocks[1].ops sub_block_ops = [op.type for op in sub_block_ops] @@ -435,6 +456,13 @@ def test_partitioner(self): self.assertTrue("c_allreduce_sum" in global_block_ops) self.assertTrue("c_allreduce_sum" in sub_block_ops) + # test fill_constant_batch_size_like + + self.assertTrue(fill_op is not None) + ref_shape = [-1, 8, 0, 48] + shape = fill_op.attr("shape") + self.assertTrue(ref_shape == shape) + if __name__ == "__main__": unittest.main() From 4a4215ffad5efada31dcdae9262a806635b1f226 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:14:31 +0800 Subject: [PATCH 12/41] [bf16] add bf16 kernel: softmax & log_softmax (#39999) * add softmax log_softmax * refine rocm * refine unittest --- paddle/fluid/operators/log_softmax_op.cu | 16 ++-- paddle/fluid/operators/math/softmax.cu | 13 +++ paddle/fluid/operators/math/softmax_impl.h | 91 +++++++++++++++++++ .../platform/device/gpu/rocm/miopen_helper.h | 17 ++++ paddle/phi/common/amp_type_traits.h | 42 +++++++++ paddle/phi/common/bfloat16.h | 18 ++-- paddle/phi/common/float16.h | 12 --- paddle/phi/kernels/gpu/softmax_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/softmax_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 12 +++ .../gpudnn/softmax_grad_kernel_gpudnn.cu | 14 ++- .../kernels/gpudnn/softmax_kernel_gpudnn.cu | 14 ++- .../fluid/tests/unittests/test_log_softmax.py | 30 +++++- .../fluid/tests/unittests/test_softmax_op.py | 52 ++++++++++- 14 files changed, 305 insertions(+), 34 deletions(-) create mode 100644 paddle/phi/common/amp_type_traits.h diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 034e67568b34c..8770abdac838f 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -13,9 +13,9 @@ // limitations under the License. 
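The kernel changes that follow register bfloat16 for log_softmax and softmax; the bf16 path accumulates in a float "master" type (the MPTypeTrait introduced by this patch). For orientation, a NumPy sketch of the standard shift-by-max log-softmax that such references typically compute (log_softmax_ref is an illustrative name, not the helper used by the unit tests):

import numpy as np

def log_softmax_ref(x, axis=-1):
    x = np.asarray(x, dtype=np.float64)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))

# softmax is np.exp(log_softmax_ref(x)); subtracting the per-row maximum keeps
# the exponentials bounded, which matters more as precision drops to bfloat16's
# 7-bit mantissa.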
#include -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -311,7 +311,7 @@ void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -433,7 +433,7 @@ void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -468,16 +468,18 @@ class LogSoftmaxGradKernel } }; -} // operators -} // paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( log_softmax_grad, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index fd879e9e6ffe7..83b124902ebb7 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -120,6 +120,10 @@ template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif // MIOPEN do not support double #ifndef PADDLE_WITH_HIP @@ -131,6 +135,10 @@ template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -139,9 +147,13 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -149,6 +161,7 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index d51d638e0c19f..9833b4447ec45 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -156,6 +156,65 @@ class SoftmaxEigen { } }; +template +class SoftmaxEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int 
kAxisDim = 1; + + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + softmax.device(*context.eigen_device()) = softmax.exp(); + softmax.device(*context.eigen_device()) = + (softmax * + softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); + } +}; + template void SoftmaxFunctor::operator()( const DeviceContext& context, const int axis_dim, @@ -289,6 +348,38 @@ class SoftmaxGradEigen { } }; +template +class SoftmaxGradEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis); + logits_grad.device(*context.eigen_device()) = + (softmax_grad - dot) * softmax; + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const int axis_dim, diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 34b9d57e055d5..1a514d2aca267 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -140,6 +140,23 @@ class CudnnDataType { } }; +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenBFloat16; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType 
= float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + template <> class CudnnDataType { public: diff --git a/paddle/phi/common/amp_type_traits.h b/paddle/phi/common/amp_type_traits.h new file mode 100644 index 0000000000000..ce3a469f5aedd --- /dev/null +++ b/paddle/phi/common/amp_type_traits.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { +namespace dtype { + +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 3fd8eb1b2684a..cf99bb8f19af0 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -377,31 +377,31 @@ struct numeric_limits { static const bool traps = true; static const bool tinyness_before = false; - static phi::dtype::bfloat16(min)() { + HOSTDEVICE static phi::dtype::bfloat16(min)() { return phi::dtype::raw_uint16_to_bfloat16(0x007f); } - static phi::dtype::bfloat16 lowest() { + HOSTDEVICE static phi::dtype::bfloat16 lowest() { return phi::dtype::raw_uint16_to_bfloat16(0xff7f); } - static phi::dtype::bfloat16(max)() { + HOSTDEVICE static phi::dtype::bfloat16(max)() { return phi::dtype::raw_uint16_to_bfloat16(0x7f7f); } - static phi::dtype::bfloat16 epsilon() { + HOSTDEVICE static phi::dtype::bfloat16 epsilon() { return phi::dtype::raw_uint16_to_bfloat16(0x3400); } - static phi::dtype::bfloat16 round_error() { + HOSTDEVICE static phi::dtype::bfloat16 round_error() { return phi::dtype::bfloat16(0.5); } - static phi::dtype::bfloat16 infinity() { + HOSTDEVICE static phi::dtype::bfloat16 infinity() { return phi::dtype::raw_uint16_to_bfloat16(0x7f80); } - static phi::dtype::bfloat16 quiet_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 quiet_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xffc1); } - static phi::dtype::bfloat16 signaling_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 signaling_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xff81); } - static phi::dtype::bfloat16 denorm_min() { + HOSTDEVICE static phi::dtype::bfloat16 denorm_min() { return phi::dtype::raw_uint16_to_bfloat16(0x0001); } }; diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 6ed9c88d70510..1cdcdef2c12ee 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -988,18 +988,6 @@ inline std::ostream& operator<<(std::ostream& os, const float16& a) { return os; } -template -class MPTypeTrait { - public: - using Type = T; -}; - -template <> -class MPTypeTrait { - public: - using Type = float; -}; - } // namespace dtype } // namespace phi diff 
--git a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu index aa496d3cd391b..04052e0dfc39a 100644 --- a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax_grad, phi::SoftmaxGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu index 32efb9b776419..03c5714b96784 100644 --- a/paddle/phi/kernels/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax, phi::SoftmaxRawKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 45798b88bb58a..c9c549379bbce 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/axis_utils.h" @@ -47,6 +49,11 @@ class VecT4 { public: using Type = int2; }; +template <> +class VecT4 { + public: + using Type = int2; +}; // Vectorization trait 2 * sizeof(T) template @@ -66,6 +73,11 @@ class VecT2 { public: using Type = int; }; +template <> +class VecT2 { + public: + using Type = int; +}; static inline int log2_ceil(int value) { int log2_value = 0; diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu index 56e5fef6e37e4..45ab645d37367 100644 --- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu @@ -38,7 +38,18 @@ PD_REGISTER_KERNEL(softmax_grad, ALL_LAYOUT, phi::SoftmaxGradGPUDNNKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax_grad, GPUDNN, @@ -48,3 +59,4 @@ PD_REGISTER_KERNEL(softmax_grad, double, phi::dtype::float16) {} #endif +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu index 427d1729a13a8..7685c7dbb6894 100644 --- a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu @@ -37,7 +37,18 @@ PD_REGISTER_KERNEL(softmax, ALL_LAYOUT, 
phi::SoftmaxRawGPUDNNKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax, GPUDNN, @@ -47,3 +58,4 @@ PD_REGISTER_KERNEL(softmax, double, phi::dtype::float16) {} #endif +#endif diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index d1437ca9c96f1..16f954708d4d4 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -14,8 +14,9 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 import paddle +import paddle.fluid.core as core import paddle.nn.functional as F np.random.seed(10) @@ -74,6 +75,33 @@ def set_attrs(self): self.axis = 1 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestLogSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.dtype = np.uint16 + self.shape = [2, 3, 4, 5] + self.axis = -1 + + x = np.random.uniform(0.1, 1., self.shape).astype(np.float32) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + + class TestNNLogSoftmaxAPI(unittest.TestCase): def setUp(self): self.x_shape = [2, 3, 4, 5] diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index a1cbefa40f307..4f1c37a242474 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -296,6 +296,56 @@ def get_x_shape(self): return [2, 3, 4, 5] +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = "softmax" + self.use_cudnn = self.init_cudnn() + self.use_mkldnn = False + self.dtype = np.uint16 + self.shape = [10, 10] + self.axis = -1 + + np.random.seed(0) + x = np.random.uniform(0.1, 1, self.shape).astype(np.float32) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x)) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = { + 'axis': self.axis, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn + } + + def init_cudnn(self): + return False + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( 
+ place, ["X"], + "Out", + numeric_grad_delta=0.05, + check_dygraph=(self.use_mkldnn == False)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestSoftmaxBF16CUDNNOp(TestSoftmaxBF16Op): + def init_cudnn(self): + return True + + class TestSoftmaxAPI(unittest.TestCase): def setUp(self): self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( From 07dad6d6ec415758d520e33960a0c53e50ef2ab5 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Wed, 2 Mar 2022 02:16:04 -0600 Subject: [PATCH 13/41] [Infrt]add phi kernel dialect (#39726) --- .gitignore | 3 + .../pybind/kernel_signature_generator.cc | 26 +- paddle/infrt/dialect/infrt/common_type.h | 18 +- paddle/infrt/dialect/infrt/infrt_ops_base.td | 7 +- paddle/infrt/dialect/init_infrt_dialects.cc | 4 + paddle/infrt/dialect/phi/ir/CMakeLists.txt | 7 +- .../infrt/dialect/phi/ir/infrt_phi_kernel.td | 24 +- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 11 +- paddle/infrt/dialect/phi/ir/phi_kernels.cc | 44 +++ paddle/infrt/dialect/phi/ir/phi_kernels.h | 42 +++ .../infrt/dialect/phi/pass/kernel_op_desc.cc | 45 ++- paddle/infrt/host_context/mlir_exec.cc | 2 + paddle/infrt/kernel/phi/context_kernels.cc | 8 +- paddle/infrt/kernel/phi/context_kernels.h | 3 +- .../infrt/kernel/phi/dense_tensor_kernels.cc | 34 ++- .../infrt/kernel/phi/dense_tensor_kernels.h | 3 +- .../infershaped/infershape_launchers_test.cc | 2 +- paddle/infrt/kernel/phi/registry.cc | 2 + .../tests/dialect/pten/dense_tensor.mlir | 12 +- paddle/scripts/infrt_build.sh | 4 +- tools/infrt/generate_phi_kernel_dialect.py | 276 ++++++++++++++++++ tools/infrt/get_phi_kernel_info.py | 12 +- 22 files changed, 536 insertions(+), 53 deletions(-) create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.cc create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.h create mode 100644 tools/infrt/generate_phi_kernel_dialect.py diff --git a/.gitignore b/.gitignore index cecd6fa91c754..debec551d9cd7 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. 
paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td +paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td +tools/infrt/kernels.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8283a249ded4c..f0d5a4e477fe4 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -49,24 +49,30 @@ int main(int argc, char **argv) { if (kernel_signature_map.Has(op_kernel_pair.first)) { std::cout << "\"" << op_kernel_pair.first << "\":{"; auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + std::cout << "\"inputs\":["; - for (auto name : std::get<0>(args)) { - std::cout << "\"" << name << "\","; + auto inputs_ = std::get<0>(args); + if (inputs_.size() > 0) std::cout << inputs_[0]; + for (size_t i = 1; i < inputs_.size(); i++) { + std::cout << ",\"" << inputs_[i] << "\""; } - if (std::get<0>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"attrs\":["; - for (auto name : std::get<1>(args)) { - std::cout << "\"" << name << "\","; + auto attrs_ = std::get<1>(args); + if (attrs_.size() > 0) std::cout << attrs_[0]; + for (size_t i = 1; i < attrs_.size(); i++) { + std::cout << ",\"" << attrs_[i] << "\""; } - if (std::get<1>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"outputs\":["; - for (auto name : std::get<2>(args)) { - std::cout << "\"" << name << "\","; + auto outputs_ = std::get<2>(args); + for (size_t i = 1; i < outputs_.size(); i++) { + std::cout << ",\"" << outputs_[i] << "\""; } - if (std::get<2>(args).size() > 0) std::cout << "\b"; + std::cout << "]},"; } } - std::cout << "\b}" << std::endl; + std::cout << "}" << std::endl; return 0; } diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h index d6d6503c03be5..436e7920ca5c6 100644 --- a/paddle/infrt/dialect/infrt/common_type.h +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -21,8 +21,22 @@ namespace infrt { enum class TargetType : uint8_t { CPU, GPU, UNK }; -enum class PrecisionType : uint8_t { FLOAT32, FLOAT16, UNK }; -enum class LayoutType : uint8_t { NCHW, NHWC, UNK }; +enum class LayoutType : uint8_t { NCHW, NHWC, ANY, UNK }; +enum class PrecisionType : uint8_t { + UINT8, + INT8, + INT16, + INT32, + INT64, + FLOAT16, + BFLOAT16, + FLOAT32, + FLOAT64, + COMPLEX64, + COMPLEX128, + BOOL, + UNK +}; struct Place { TargetType target; diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 978b126d75416..f19912dc0cd59 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -34,9 +34,10 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { let summary = "infrt dense tensor"; let description = [{dense_tensor<, 3>}]; let parameters = (ins - "TargetType":$target, - "PrecisionType":$precision, - "LayoutType":$layout + "::infrt::TargetType":$target, + "::infrt::PrecisionType":$precision, + "::infrt::LayoutType":$layout + ); } diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index c5c81b4b0f22d..5eae01719361d 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,6 +23,8 @@ #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" 
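+// phi_kernels.h pulls in the TableGen-generated phi_cpu / phi_gpu kernel
+// dialect declarations so they can be registered together with the existing
+// PHI dialects below.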
+#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { @@ -34,6 +36,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT mlir::pd::PaddleDialect, #ifdef INFRT_WITH_PHI phi::PHIDenseTensorDialect, + phi::PHICPUKernelDialect, + phi::PHIGPUKernelDialect, phi::PHIDialect #endif >(); diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt index 8c1d75629d09c..0497b9832118f 100644 --- a/paddle/infrt/dialect/phi/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -1,9 +1,12 @@ #mlir_tablegen_on(infrt_phi_base DIALECT phi) add_mlir_dialect(infrt_phi_base phi) add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) +add_mlir_dialect(phi_cpu_kernels phi_cpu) +add_mlir_dialect(phi_gpu_kernels phi_gpu) + #mlir_tablegen_on(infrt_phi_tensor) gather_srcs(infrt_src SRCS phi_base.cc - infrt_phi_tensor.cc) + infrt_phi_tensor.cc + phi_kernels.cc) diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 37bf0b5ef213d..ee23470fc754a 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -6,24 +6,32 @@ include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" -def PHI_KernelDialect : Dialect { - let name = "phi_kernel"; +def PHI_CPUKernelDialect : Dialect { + let name = "phi_cpu"; let description = [{ - The PHI Kernel dialect. + The PHI CPU Kernel dialect. + }]; + + let cppNamespace = "::infrt::phi"; +} + +def PHI_GPUKernelDialect : Dialect { + let name = "phi_gpu"; + + let description = [{ + The PHI GPU Kernel dialect. }]; let cppNamespace = "::infrt::phi"; } // PHI Kernel related ops. -class PDT_Kernel traits = []> : Op { +class PDTCPU_Kernel traits = []> : Op { } -def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x); - let results = (outs DenseTensor:$output); +// PHI Kernel related ops. +class PDTGPU_Kernel traits = []> : Op { } #endif - diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index dc3a4b340d767..39677871ff8fe 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -34,6 +34,14 @@ class FillDenseTensorOp : attr_type:$value ); let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +class PrintDenseTensorOp: + PDT_Op<"print_tensor"> { + let arguments = (ins DenseTensor:$input); + let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } class CreateCPUAllocatorOp @@ -44,7 +52,7 @@ class CreateCPUAllocatorOp class CreateCPUContextOp : PDT_Op<"create_context." 
# "cpu", [NoSideEffect]> { - let arguments = (ins); + let arguments = (ins CPU_Allocator:$input); let results = (outs CPU_Context:$output); } @@ -52,6 +60,7 @@ def PDT_CreateDenseTensorOp_cpu_f32_nchw : CreateDenseTensorOp<"cpu", "f32", "nc def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; def PDT_CreateContextOp_cpu : CreateCPUContextOp; +def PDT_PrintDenseTensor_cpu : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.cc b/paddle/infrt/dialect/phi/ir/phi_kernels.cc new file mode 100644 index 0000000000000..c7a837b83fc24 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +#include + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + +namespace infrt { +namespace phi { + +void PHICPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + >(); +} + +void PHIGPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + >(); +} + +} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h new file mode 100644 index 0000000000000..b84d1b2b7294b --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
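+//
+// Declarations for the phi_cpu and phi_gpu kernel dialects. The op definitions
+// themselves live in phi_cpu_kernels.td / phi_gpu_kernels.td, which are
+// generated by tools/infrt/generate_phi_kernel_dialect.py from kernels.json.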
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.h.inc" + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 63869b7d7b9ea..6c0f6df892100 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -41,26 +41,49 @@ TargetType cvtTargetFromPhi(phi::Backend backend) { } phi::DataType cvtPrecision2Phi(PrecisionType precision) { +#define CONVERT_PRECISION_TO_PHI(Precision) \ + case PrecisionType::Precision: \ + return phi::DataType::Precision; + switch (precision) { - case PrecisionType::FLOAT32: - return phi::DataType::FLOAT32; - break; - case PrecisionType::FLOAT16: - return phi::DataType::FLOAT16; + CONVERT_PRECISION_TO_PHI(FLOAT32) + CONVERT_PRECISION_TO_PHI(FLOAT16) + CONVERT_PRECISION_TO_PHI(FLOAT64) + CONVERT_PRECISION_TO_PHI(UINT8) + CONVERT_PRECISION_TO_PHI(INT8) + CONVERT_PRECISION_TO_PHI(INT16) + CONVERT_PRECISION_TO_PHI(INT32) + CONVERT_PRECISION_TO_PHI(INT64) + CONVERT_PRECISION_TO_PHI(COMPLEX64) + CONVERT_PRECISION_TO_PHI(COMPLEX128) + CONVERT_PRECISION_TO_PHI(BOOL) default: return phi::DataType::UNDEFINED; } +#undef CONVERT_PRECISION_TO_PHI } PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { +#define CONVERT_PRECISION_FROM_PHI(Precision) \ + case phi::DataType::Precision: \ + return PrecisionType::Precision; + switch (datatype) { - case phi::DataType::FLOAT32: - return PrecisionType::FLOAT32; - case phi::DataType::FLOAT16: - return PrecisionType::FLOAT16; + CONVERT_PRECISION_FROM_PHI(FLOAT32) + CONVERT_PRECISION_FROM_PHI(FLOAT16) + CONVERT_PRECISION_FROM_PHI(FLOAT64) + CONVERT_PRECISION_FROM_PHI(UINT8) + CONVERT_PRECISION_FROM_PHI(INT8) + CONVERT_PRECISION_FROM_PHI(INT16) + CONVERT_PRECISION_FROM_PHI(INT32) + CONVERT_PRECISION_FROM_PHI(INT64) + CONVERT_PRECISION_FROM_PHI(COMPLEX64) + CONVERT_PRECISION_FROM_PHI(COMPLEX128) + CONVERT_PRECISION_FROM_PHI(BOOL) default: return PrecisionType::UNK; } +#undef CONVERT_PRECISION_FROM_PHI } phi::DataLayout cvtLayout2Phi(LayoutType layout) { @@ -69,6 +92,8 @@ phi::DataLayout cvtLayout2Phi(LayoutType layout) { return phi::DataLayout::NCHW; case LayoutType::NHWC: return phi::DataLayout::NHWC; + case LayoutType::ANY: + return phi::DataLayout::ANY; default: return phi::DataLayout::UNDEFINED; } @@ -80,6 +105,8 @@ LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { return LayoutType::NCHW; case phi::DataLayout::NHWC: return LayoutType::NHWC; + case phi::DataLayout::ANY: + return LayoutType::ANY; default: return LayoutType::UNK; } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 79717ba2cc034..7823681079f67 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -29,6 +29,7 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI +#include 
"paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" #endif @@ -58,6 +59,7 @@ int main(int argc, char** argv) { kernel::RegisterControlFlowKernels(®istry); #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(®istry); + kernel::RegisterInferShapeLaunchers(®istry); #endif // load extra shared library diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 5284f499916c3..3caaf1788e3f8 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -18,7 +18,13 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext() { return {}; } +::phi::CPUContext CreateCpuContext( + infrt::backends::CpuPhiAllocator* allocator) { + ::phi::CPUContext context; + context.SetAllocator(allocator); + context.Init(); + return context; +} } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 8082dc6c2ff29..7f1e7ef6cd356 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/core/dense_tensor.h" @@ -21,7 +22,7 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext(); +::phi::CPUContext CreateCpuContext(::infrt::backends::CpuPhiAllocator*); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index ce9200b9918c0..871336e8762e8 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" - +#include namespace infrt { namespace kernel { namespace phi { @@ -30,8 +30,38 @@ ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( } void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values) {} + host_context::Attribute> values) { + auto place = ::phi::CPUPlace(); + float* a_data = dense_tensor->mutable_data(place); + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (values.get())[i]; + } +} +void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + break; \ + } + + ::phi::DDim dims = dense_tensor->dims(); + std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," + << " values=["; + switch (dense_tensor->dtype()) { + PRINT_META_DATA(FLOAT32, float); + PRINT_META_DATA(INT32, int32_t); + default: + std::cout << "Error! 
Unsupported data type!\n"; + } + std::cout << "]\n"; +#undef PRINT_META_DATA +} } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 25daf7027e8cb..920c0b1c8af42 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -28,7 +28,8 @@ ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( host_context::Attribute> lod); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values); + host_context::Attribute> values); +void PrintDenseTensor(::phi::DenseTensor* dense_tensor); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 2161e98fac833..37f9197edb728 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("pten.add.cpu.any.fp32"); + auto creator = registry.GetKernel("phi_cpu.add.any.float32"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 5d79814d4bec7..15e2d21005e03 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -42,6 +42,8 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw)); registry->AddKernel("phi_dt.fill_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernel("phi_dt.print_tensor", + INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); registry->AddKernel( "phi_dt.fake_phi_kernel", std::bind(&KernelLauncherFunc !phi.CPU_allocator - %ctx = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.CPU_allocator) -> !phi.CPU_context %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () + %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.CPU_context, !infrt.dense_tensor) -> (!infrt.dense_tensor) - // CHECK: @FakePhiKernel@ - %d = "phi_dt.fake_phi_kernel" (%ctx, %t, %t) {transpose_x=false, transpose_y=false} : (!phi.CPU_context, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + // CHECK: dense_tensor: shape=shape[1], values=[1] + "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () Infrt.return } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index a0132501387e0..75b27e4165d17 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -33,14 +33,16 @@ function update_pd_ops() { rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build cmake .. 
-DWITH_PYTHON=ON -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF - make -j8 paddle_python + make -j8 paddle_python print_pten_kernels cd ${PADDLE_ROOT}/build + ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json cd python/dist/ python3 -m pip uninstall -y paddlepaddle python3 -m pip install *whl # update pd_ops.td cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py + python3 generate_phi_kernel_dialect.py ./kernels.json } function init() { diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py new file mode 100644 index 0000000000000..80cf3958b156d --- /dev/null +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -0,0 +1,276 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import sys + +attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'} +supported_kernels = ['sign', 'dot', 'digamma', 'conj'] + +target_type_converter = {"CPU": "CPU", "GPU": "GPU"} +layout_type_converter = { + "NCHW": "NCHW", + "NHWC": "NHWC", + "Undefined(AnyLayout)": "ANY" +} +precision_type_converter = { + "uint8": "UINT8", + "int8": "INT8", + "int16": "INT16", + "int32": "INT32", + "int64": "INT64", + "float16": "FLOAT16", + "bfloat16": "BFLOAT16", + "float32": "FLOAT32", + "float64": "FLOAT64", + "complex64": "COMPLEX64", + "complex128": "COMPLEX128", + "bool": "BOOL" +} + + +def generate_kernel_name(op_name, place_str): + [target_, layout_, precision_] = place_str[1:-1].split(',') + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + alias_ = "{}.{}".format(op_name, ".".join( + [target_.strip(), layout_.strip(), precision_.strip()])) + return alias_ + + +def generate_attrs_info(op_name, attrs_info): + kernel_attrs_names = { + 'split': ['sections', 'num', 'axis', 'mkldnn_data_type'], + 'sign': [], + 'masked_select': [], + 'trace': ['offset', 'axis1', 'axis2'], + 'concat': ['axis'], + 'empty': ['shape', 'dtype'], + 'conj': [], + 'norm': ['axis', 'epsilon', 'is_test'], + 'histogram': ['bins', 'min', 'max'], + 'dot': [], + 'scale': ['scale', 'bias', 'bias_after_scale'], + 'digamma': [], + 'lerp': [], + 'cast': ['out_dtype', 'in_dtype'], + 'abs': [] + } + attrs_args_ = "" + if len(kernel_attrs_names[op_name]) == len(attrs_info): + for index in range(len(attrs_info)): + attr_name = kernel_attrs_names[op_name][index] + attr_type = attr_type_converter[attrs_info[index]] + attrs_args_ += '{type_}:${name_},'.format( + type_=attr_type, name_=attr_name) + return attrs_args_[:-1] + + +def generate_inputs_info(input_info): + input_args_ = "" + for index in range(len(input_info)): + [target_, layout_, precision_] = input_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = 
layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + input_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$in{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + input_args_ = input_args_[:-1] + return input_args_ + + +def generate_arguments_info(op_name, input_info, attr_info): + input_args = generate_inputs_info(input_info) + attr_args = generate_attrs_info(op_name, attr_info) + context_args = "CPU_Context:$dev_ctx" + argument_ = "{},{},{}".format(context_args, input_args, attr_args) + return (("let arguments = (ins {});".format(argument_.strip(",")))) + + +def generate_results_info(output_info): + output_args_ = "let results = (outs " + for index in range(len(output_info)): + [target_, layout_, precision_] = output_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + output_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$out{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + return ("{});".format(output_args_[:-1])) + + +def generate_supported_kernel_list(load_dict): + supported_kernels_list_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + attributes = kernel_info[kernel_alias_]["attribute"] + flag = True + for attribute in attributes: + if attribute not in attr_type_converter: + flag = False + if flag: + supported_kernels_list_.append(op_name) + + alias_ = generate_kernel_dialect(op_name, kernel_alias_, + kernel_info[kernel_alias_]) + supported_kernels_list_ = list(set(supported_kernels_list_)) + print(supported_kernels_list_) + + +def scan_kernel_info(load_dict): + target_type_ = [] + layout_type_ = [] + precision_type_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + [target_, layout_, precision_] = kernel_alias_[1:-1].split(',') + target_type_.append(target_.strip()) + layout_type_.append(layout_.strip()) + precision_type_.append(precision_.strip()) + target_type_ = list(set(target_type_)) + layout_type_ = list(set(layout_type_)) + precision_type_ = list(set(precision_type_)) + print(target_type_) + print(layout_type_) + print(precision_type_) + + +def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." 
+ dialect_name[ + 3] + + header = 'def {kernel_name} : PDTCPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_gpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[ + 3] + + header = 'def {kernel_name} : PDTGPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_dialect_head(): + comment_ = "/*===- TableGen'source file -----------------------------------------------===*\\\n\ +|* *|\n\ +|* Kernel Definitions *|\n\ +|* *|\n\ +|* Automatically generated file, do not edit! 
*|\n\ +|* Generated by tools/infrt/generate_pten_kernel_dialect.py *|\n\ +|* *|\n\ +\*===----------------------------------------------------------------------===*/\n" + + includes_ = "#ifndef PTEN_KERNELS\n\ +#define PTEN_KERNELS\n\ +include \"mlir/Interfaces/InferTypeOpInterface.td\"\n\ +include \"mlir/Interfaces/LoopLikeInterface.td\"\n\ +include \"mlir/IR/OpBase.td\"\n\ +include \"paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td\"" + + return (comment_ + includes_) + + +def get_kernel_target(kernel_alias_): + target = kernel_alias_[1:-1].split(",") + return target[0] + + +def main(path_): + with open(path_, "r") as f: + load_dict = json.load(f) + + head = generate_dialect_head() + + cpu_registry_ = "" + gpu_registry_ = "" + for op_name in load_dict: + if op_name not in supported_kernels: + continue + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + if get_kernel_target(kernel_alias_) == "CPU": + kernel_registry = generate_cpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + cpu_registry_ += kernel_registry + elif get_kernel_target(kernel_alias_) == "GPU": + kernel_registry = generate_gpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + gpu_registry_ += kernel_registry + else: + print("Unsupported backend:" + get_kernel_target( + kernel_alias_)) + end = "#endif // PTEN_KERNELS" + with open("../../paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=cpu_registry_, end_=end)) + with open("../../paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=gpu_registry_, end_=end)) + + +if __name__ == '__main__': + path = sys.argv[1] + main(path) diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index f3e9f345da27b..9ea3fef003054 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -150,19 +150,19 @@ def gen_dtype(vals: List[str]): ir_dtypes, origin_dtypes = [], [] for val in vals: if val == "float": - ir_dtypes.append("fp32") + ir_dtypes.append("float32") origin_dtypes.append("float") elif val == "double": - ir_dtypes.append("fp64") + ir_dtypes.append("float64") origin_dtypes.append("double") elif val == "float16": - ir_dtypes.append("fp16") + ir_dtypes.append("float16") origin_dtypes.append("paddle::experimental::float16") elif val == "bfloat16": ir_dtypes.append("bf16") origin_dtypes.append("paddle::experimental::bfloat16") elif val == "bool": - ir_dtypes.append("int1") + ir_dtypes.append("bool") origin_dtypes.append("bool") elif val == "int8_t": ir_dtypes.append("int8") @@ -219,8 +219,8 @@ def gen_register_info(resources: List[List[str]]): for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype) - ir_name = 'pten.' + '.'.join( - [it.lower() for it in update_item[:3]]) + "." + ir_dtype + ir_name = 'phi_cpu.' + update_item[0].lower() + '.' + update_item[ + 2].lower() + '.' 
+ ir_dtype res += f""" registry->AddKernel("{ir_name}",""" From f30b3f810d1b7e341507450313503cf4702f7d8a Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 2 Mar 2022 16:17:43 +0800 Subject: [PATCH 14/41] support checking `phi` directory in CI op benchmark (#40026) * support phi checking in CI op benchmark * add sparse/gpu * remove h file in cpu directory --- tools/ci_op_benchmark.sh | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 1db79418b2d8f..0937ebe5343fc 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -43,20 +43,33 @@ function match_cu_file_directory { do [ "${cu_file_dir}" == "paddle/fluid/operators${sub_dir}" ] && return 0 done - for sub_dir in "" "/gpu" "/hybird" + for sub_dir in "" "/gpu" "/gpudnn" "/sparse/gpu" do [ "${cu_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 0 done return 1 } +# Limit h file directory +function match_h_file_directory { + LOG "[INFO] run function match_h_file_directory" + local sub_dir h_file_dir + h_file_dir=$(dirname ${1}) + # '.h' file should not in directory below + for sub_dir in "" "/cpu" + do + [ "${h_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 1 + done + return 0 +} + # Load op files by header file function load_CHANGE_OP_FILES_by_header_file { LOG "[INFO] run function load_CHANGE_OP_FILES_by_header_file" local change_file for change_file in $(grep -rl "${1}" paddle/fluid/operators paddle/phi/kernels/) do - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -64,6 +77,7 @@ function load_CHANGE_OP_FILES_by_header_file { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue [ -n "${INCLUDE_SEARCH_MAP[$change_file]}" ] && continue LOG "[INFO] Found \"${1}\" include by \"${change_file}\", keep searching." INCLUDE_SEARCH_MAP[$change_file]="searched" @@ -82,7 +96,7 @@ function load_CHANGE_OP_FILES { # match directory limit [[ "$change_file" =~ "paddle/fluid/operators/" ]] || [[ "$change_file" =~ "paddle/phi/kernels/" ]] || continue # match file name limit - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -90,6 +104,7 @@ function load_CHANGE_OP_FILES { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue LOG "[INFO] Found \"${change_file}\" changed, keep searching." INCLUDE_SEARCH_MAP[${change_file}]="searched" load_CHANGE_OP_FILES_by_header_file $change_file @@ -131,6 +146,8 @@ function load_CHANGE_OP_MAP { op_name=${change_file_name##*/} op_name=${op_name%_cudnn_op*} op_name=${op_name%_op*} + op_name=${op_name%_grad_kernel*} + op_name=${op_name%_kernel*} [ -n "${SKIP_OP_MAP[$op_name]}" ] && continue LOG "[INFO] Load op: \"${op_name}\"." CHANGE_OP_MAP[${op_name}]="$change_file" From 1c4e3e5dd0d32a4216bdad0b1cafcab4ca5ed5bb Mon Sep 17 00:00:00 2001 From: ziyoujiyi <73728031+ziyoujiyi@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:23:52 +0800 Subject: [PATCH 15/41] new fleet_desc builder (#39948) * delete gloo connect retry * the_one_ps dirs reconstruct * . 
* . * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * refactor ps optimize * refactor ps optimize * refactor ps optimize * . * . * . * . * . * . * refactor theoneps * the_one_ps * add ps pass unittest * add ps pass unittest * ps unitest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * add cpu_async_ps_mode test * add cpu_async_ps_mode test * add cpu_async_ps_mode test * ps unittest ready * ps unittest ready * solve dist_pass init conflict * solve import CommContext error * unittest ok * implement AllocateFrom * solve setup.py.in conflict * solve conflict * solve conflict * solve conflict * . * . * cpu-async-ps minimize test ok & gpu minimize test ok * add heter 2stage unittest * add heter 2stage unittest * add heter 2stage unittest * sync/geo test ok & fix heter_worker program ok * . * new fleet desc generator * new fleet_desc builder * new fleet_desc builder * . * . * correct ps.proto compile * . Co-authored-by: zkh2016 --- paddle/fluid/distributed/ps/ps.proto | 13 - paddle/fluid/framework/CMakeLists.txt | 5 +- paddle/fluid/framework/ps.proto | 213 ++++ .../fleet/meta_optimizers/ps_optimizer.py | 1 + python/paddle/distributed/ps/README.md | 3 - python/paddle/distributed/ps/the_one_ps.py | 1022 ++++++++--------- .../paddle/distributed/ps/utils/ps_factory.py | 4 +- .../ps/utils/ps_program_builder.py | 5 +- python/paddle/distributed/ps/utils/public.py | 4 +- .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../distributed_passes/ps_pass_test_base.py | 54 +- .../test_ps_trainer_pass.py | 122 +- .../fluid/tests/unittests/ps/CMakeLists.txt | 4 +- .../tests/unittests/ps/ps_dnn_trainer.py | 86 +- .../tests/unittests/ps/test_the_one_ps.py | 92 +- .../fluid/tests/unittests/ps_dnn_model.py | 1 + 16 files changed, 961 insertions(+), 670 deletions(-) delete mode 100755 paddle/fluid/distributed/ps/ps.proto mode change 100644 => 100755 paddle/fluid/framework/CMakeLists.txt create mode 100755 paddle/fluid/framework/ps.proto delete mode 100755 python/paddle/distributed/ps/README.md mode change 100644 => 100755 python/paddle/fluid/tests/unittests/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py diff --git a/paddle/fluid/distributed/ps/ps.proto b/paddle/fluid/distributed/ps/ps.proto deleted file mode 100755 index 2691f637527d4..0000000000000 --- a/paddle/fluid/distributed/ps/ps.proto +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt old mode 100644 new mode 100755 index 14aecb5fd43c4..02d90b9c6da1e --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -235,6 +235,7 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) + py_proto_compile(ps_py_proto SRCS ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -242,12 +243,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -259,6 +261,7 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} + COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." diff --git a/paddle/fluid/framework/ps.proto b/paddle/fluid/framework/ps.proto new file mode 100755 index 0000000000000..0ae87812bce43 --- /dev/null +++ b/paddle/fluid/framework/ps.proto @@ -0,0 +1,213 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
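+//
+// Runtime configuration for the parameter server. PSParameter is the root
+// message; per-table settings (TableParameter, accessors, sparse SGD rules)
+// are filled in directly through the generated ps_pb2 module by
+// python/paddle/distributed/ps/the_one_ps.py.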
+ +syntax = "proto2"; +package paddle.distributed; +option cc_generic_services = true; +option cc_enable_arenas = true; + +message FsClientParameter { + enum FsApiType { + HDFS = 0; + AFS = 1; + } + optional FsApiType fs_type = 1 [ default = HDFS ]; + optional string uri = 2; // such as afs://xxx.afs.com:9902 + optional string user = 3; // user_name to access fs + optional string passwd = 4; // password + optional int32 buffer_size = 5; // buffer for read/write + optional string hadoop_bin = 51; + optional string afs_conf = 101; +} + +message PSParameter { + optional string worker_class = 1; + optional string server_class = 2; + optional string instance_class = 3; + optional string init_gflags = 4 [ default = "" ]; + optional WorkerParameter worker_param = 101; + optional ServerParameter server_param = 102; + repeated DownpourTrainerParameter trainer_param = 301; + optional FsClientParameter fs_client_param = 501; +} + +message WorkerParameter { + optional DownpourWorkerParameter downpour_worker_param = 1; +} + +message DownpourWorkerParameter { + repeated TableParameter downpour_table_param = 1; +} + +message DownpourServerParameter { + repeated TableParameter downpour_table_param = 1; + optional ServerServiceParameter service_param = 2; +} + +message ServerParameter { + optional DownpourServerParameter downpour_server_param = 1; +} + +message DownpourTrainerParameter { + repeated DenseTableParameter dense_table = 1; + repeated SparseTableParameter sparse_table = 2; + optional int32 push_sparse_per_batch = 3; + optional int32 push_dense_per_batch = 4; + repeated string skip_op = 5; + repeated ProgramConfig program_config = 6; +} + +message DenseTableParameter { + optional int32 table_id = 1; + repeated string dense_variable_name = 2; + repeated string dense_gradient_variable_name = 3; + optional int32 fea_dim = 4; +} + +message SparseTableParameter { + optional int32 table_id = 1; + optional int32 feature_dim = 2; + repeated string slot_key = 3; + repeated string slot_value = 4; + repeated string slot_gradient = 5; +} + +message ServerServiceParameter { + optional string server_class = 1 [ default = "BrpcPsServer" ]; + optional string client_class = 2 [ default = "BrpcPsClient" ]; + optional string service_class = 3 [ default = "BrpcPsService" ]; + optional uint32 start_server_port = 4 + [ default = 0 ]; // will find a avaliable port from it + optional uint32 server_thread_num = 5 [ default = 12 ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +enum TableType { + PS_SPARSE_TABLE = 0; + PS_DENSE_TABLE = 1; + PS_OTHER_TABLE = 2; +} + +message TableParameter { + optional uint64 table_id = 1; + optional string table_class = 2; + optional uint64 shard_num = 3 [ default = 1000 ]; + optional TableAccessorParameter accessor = 4; + optional TensorAccessorParameter tensor = 5; + optional CommonAccessorParameter common = 6; + optional TableType type = 7; + optional bool compress_in_save = 8 [ default = false ]; +} + +message TableAccessorParameter { + optional string accessor_class = 1; + optional uint32 fea_dim = 4 [ default = 11 ]; + optional uint32 embedx_dim = 5 [ default = 8 ]; + optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; + repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SparseCommonSGDRuleParameter 
embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd +} + +message TensorAccessorParameter { + optional string feed_var_name = 1; + optional string fetch_var_name = 2; + optional int64 startup_program_id = 3; + optional int64 main_program_id = 4; + optional string tensor_table_class = 6; +} + +message CommonAccessorParameter { + optional string name = 1; + optional string table_name = 2; + repeated string attributes = 3; + repeated string params = 4; + repeated uint32 dims = 5; + repeated string initializers = 6; + optional string entry = 7; + optional int32 trainer_num = 8; + optional bool sync = 9; + optional uint32 table_num = 10; + optional uint32 table_dim = 11; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; +} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 100a6882b1b35..00937dbe7a432 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -54,6 +54,7 @@ def _init_ps_pass_context(self, loss, startup_program): attrs['cloned_startup'] = attrs['origin_startup_program'].clone() attrs['user_defined_strategy'] = self.user_defined_strategy + 
attrs['valid_strategy'] = self.user_defined_strategy attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy) attrs['ps_mode'] = attrs['trainer'].mode logger.info("ps_mode: {}".format(attrs['ps_mode'])) diff --git a/python/paddle/distributed/ps/README.md b/python/paddle/distributed/ps/README.md deleted file mode 100755 index 8d28031794f5d..0000000000000 --- a/python/paddle/distributed/ps/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# 目录说明 - -* 改完之后,上层目录中 fleet 中相关文件(夹)就可以删除 diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 14a68ad916747..cc744bc9d9edb 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -15,10 +15,11 @@ import warnings import os +from paddle.distributed.fleet.proto import ps_pb2 import paddle.fluid as fluid import paddle.distributed.fleet as fleet from paddle.fluid import core -from .utils.public import * +from paddle.distributed.ps.utils.public import * from paddle.fluid.framework import Program from paddle.fluid.compiler import CompiledProgram from paddle.fluid.executor import Executor @@ -29,14 +30,10 @@ from paddle.fluid.communicator import Communicator, HeterClient from google.protobuf import text_format -__all__ = [] - - -def conv_indent(indent): - return "".join([" "] * indent) - - -PSERVER_SAVE_SUFFIX = ".shard" +__all__ = [ + 'Table', 'SparseTable', 'GeoSparseTable', 'BarrierTable', 'TensorTable', + 'DenseTable' +] def get_program_by_id(context, program_id): @@ -62,129 +59,140 @@ def parse_table_class(varname, program_id, context): return "MemorySparseTable" -def get_default_accessor_proto(accessor, varname, program_id, context): +def check_embedding_dim(accessor_proto, varname, program_id, context): main_program, startup_program = get_program_by_id(context, program_id) embedding_dim = 0 for var in main_program.list_vars(): if var.name == varname: embedding_dim = var.shape[1] + print('new var: {}, {}, {}'.format(var, embedding_dim, + accessor_proto.fea_dim)) break - - if not accessor.HasField("accessor_class"): - accessor.accessor_class = "CtrCommonAccessor" - if not accessor.HasField("fea_dim"): - accessor.fea_dim = embedding_dim + 2 - if not accessor.HasField("embedx_dim"): - accessor.embedx_dim = embedding_dim - 1 - if not accessor.HasField("embedx_threshold"): - accessor.embedx_threshold = 0 - - ctr_accessor_param = accessor.ctr_accessor_param - if not ctr_accessor_param.HasField("nonclk_coeff"): - ctr_accessor_param.nonclk_coeff = 0.1 - if not ctr_accessor_param.HasField("click_coeff"): - ctr_accessor_param.click_coeff = 1.0 - if not ctr_accessor_param.HasField("base_threshold"): - ctr_accessor_param.base_threshold = 0 - if not ctr_accessor_param.HasField("delta_threshold"): - ctr_accessor_param.delta_threshold = 0 - if not ctr_accessor_param.HasField("delta_keep_days"): - ctr_accessor_param.delta_keep_days = 16 - if not ctr_accessor_param.HasField("show_click_decay_rate"): - ctr_accessor_param.show_click_decay_rate = 1 - if not ctr_accessor_param.HasField("delete_threshold"): - ctr_accessor_param.delete_threshold = 0 - if not ctr_accessor_param.HasField("delete_after_unseen_days"): - ctr_accessor_param.delete_after_unseen_days = 30 - if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): - ctr_accessor_param.ssd_unseenday_threshold = 1 - - for sgd_param in [accessor.embed_sgd_param, accessor.embedx_sgd_param]: - if not sgd_param.HasField("name"): - sgd_param.name = "SparseAdaGradSGDRule" - if sgd_param.name == "SparseAdaGradSGDRule" 
or sgd_param.name == "StdAdaGradSGDRule": - if not sgd_param.adagrad.HasField("learning_rate"): - sgd_param.adagrad.learning_rate = 0.05 - if not sgd_param.adagrad.HasField("initial_g2sum"): - sgd_param.adagrad.initial_g2sum = 3.0 - if not sgd_param.adagrad.HasField("initial_range"): - sgd_param.adagrad.initial_range = 0.0001 - if len(sgd_param.adagrad.weight_bounds) == 0: - sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseNaiveSGDRule": - if not sgd_param.naive.HasField("learning_rate"): - sgd_param.naive.learning_rate = 0.05 - if not sgd_param.naive.HasField("initial_range"): - sgd_param.naive.initial_range = 0.0001 - if len(sgd_param.naive.weight_bounds) == 0: - sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseAdamSGDRule": - if not sgd_param.adam.HasField("learning_rate"): - sgd_param.adam.learning_rate = 0.001 - if not sgd_param.adam.HasField("initial_range"): - sgd_param.adam.initial_range = 0.0001 - if not sgd_param.adam.HasField("beta1_decay_rate"): - sgd_param.adam.beta1_decay_rate = 0.9 - if not sgd_param.adam.HasField("beta2_decay_rate"): - sgd_param.adam.beta2_decay_rate = 0.999 - if not sgd_param.adam.HasField("ada_epsilon"): - sgd_param.adam.ada_epsilon = 1e-08 - if len(sgd_param.adam.weight_bounds) == 0: - sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) - - -def check_embedding_dim(accessor, varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) - embedding_dim = 0 - for var in main_program.list_vars(): - if var.name == varname: - embedding_dim = var.shape[1] - break - fea_dim = accessor.fea_dim + fea_dim = accessor_proto.fea_dim if fea_dim != embedding_dim + 2: raise ValueError( "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}". format(embedding_dim + 2, fea_dim)) - embedx_dim = accessor.embedx_dim + embedx_dim = accessor_proto.embedx_dim if embedx_dim != embedding_dim - 1: raise ValueError( "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}". 
format(embedding_dim - 1, embedx_dim)) +class Service: + def __init__(self): + pass + + def _set(self, service_proto): + service_proto.server_class = "BrpcPsServer" + service_proto.client_class = "BrpcPsClient" + service_proto.service_class = "BrpcPsService" + service_proto.start_server_port = 0 + service_proto.server_thread_num = 12 + + +class GpuService(Service): + def __init__(self): + super(GpuService).__init__(self) + + def _set(self, service_proto): + super(GpuService)._set(service_proto) + service_proto.server_class = 'PsLocalServer' + service_proto.client_class = 'PsLocalClient' + + class Accessor: def __init__(self): self.accessor_class = "" self.optimizer = None - self.feature_dim = -1 - self.embedding_dim = -1 - self.optimizer = None - - def to_string(self, indent): - accessor_str = "{}accessor {{{}\n{}}}" - attrs = "" - attrs += "accessor_class: \"{}\" ".format(self.accessor_class) - attrs += "fea_dim: {} ".format(self.feature_dim) - attrs += "embedx_dim: {} ".format(self.embedding_dim) - attrs += "\n" - if self.optimizer is not None: - attrs += self.optimizer.to_string(indent) - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + self.feature_dim = 0 + self.embedding_dim = 0 + # TableAccessorParameter accessor + def _set(self, accessor_proto, varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) + embedding_dim = 0 + for var in main_program.list_vars(): + if var.name == varname: + embedding_dim = var.shape[1] + break -class CommonAccessor: + if not accessor_proto.HasField("accessor_class"): + accessor_proto.accessor_class = "CtrCommonAccessor" + if not accessor_proto.HasField("fea_dim"): + accessor_proto.fea_dim = embedding_dim + 2 + if not accessor_proto.HasField("embedx_dim"): + accessor_proto.embedx_dim = embedding_dim - 1 + if not accessor_proto.HasField("embedx_threshold"): + accessor_proto.embedx_threshold = 0 + + ctr_accessor_param = accessor_proto.ctr_accessor_param + if not ctr_accessor_param.HasField("nonclk_coeff"): + ctr_accessor_param.nonclk_coeff = 0.1 + if not ctr_accessor_param.HasField("click_coeff"): + ctr_accessor_param.click_coeff = 1.0 + if not ctr_accessor_param.HasField("base_threshold"): + ctr_accessor_param.base_threshold = 0 + if not ctr_accessor_param.HasField("delta_threshold"): + ctr_accessor_param.delta_threshold = 0 + if not ctr_accessor_param.HasField("delta_keep_days"): + ctr_accessor_param.delta_keep_days = 16 + if not ctr_accessor_param.HasField("show_click_decay_rate"): + ctr_accessor_param.show_click_decay_rate = 1 + if not ctr_accessor_param.HasField("delete_threshold"): + ctr_accessor_param.delete_threshold = 0 + if not ctr_accessor_param.HasField("delete_after_unseen_days"): + ctr_accessor_param.delete_after_unseen_days = 30 + if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): + ctr_accessor_param.ssd_unseenday_threshold = 1 + + for sgd_param in [ + accessor_proto.embed_sgd_param, accessor_proto.embedx_sgd_param + ]: + if not sgd_param.HasField("name"): + sgd_param.name = "SparseAdaGradSGDRule" + if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": + if not sgd_param.adagrad.HasField("learning_rate"): + sgd_param.adagrad.learning_rate = 0.05 + if not sgd_param.adagrad.HasField("initial_g2sum"): + sgd_param.adagrad.initial_g2sum = 3.0 + if not sgd_param.adagrad.HasField("initial_range"): + sgd_param.adagrad.initial_range = 0.0001 + if len(sgd_param.adagrad.weight_bounds) == 0: + 
sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseNaiveSGDRule": + if not sgd_param.naive.HasField("learning_rate"): + sgd_param.naive.learning_rate = 0.05 + if not sgd_param.naive.HasField("initial_range"): + sgd_param.naive.initial_range = 0.0001 + if len(sgd_param.naive.weight_bounds) == 0: + sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseAdamSGDRule": + if not sgd_param.adam.HasField("learning_rate"): + sgd_param.adam.learning_rate = 0.001 + if not sgd_param.adam.HasField("initial_range"): + sgd_param.adam.initial_range = 0.0001 + if not sgd_param.adam.HasField("beta1_decay_rate"): + sgd_param.adam.beta1_decay_rate = 0.9 + if not sgd_param.adam.HasField("beta2_decay_rate"): + sgd_param.adam.beta2_decay_rate = 0.999 + if not sgd_param.adam.HasField("ada_epsilon"): + sgd_param.adam.ada_epsilon = 1e-08 + if len(sgd_param.adam.weight_bounds) == 0: + sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) + + +class CommonAccessor(Accessor): def __init__(self): - self.accessor_class = "" - self.table_name = None - self.entry = None + super(CommonAccessor, self).__init__() + self.table_name = '' + self.entry = 'none' self.attrs = [] self.params = [] self.dims = [] self.trainer_num = 0 - self.sync = "false" - self.table_num = None - self.table_dim = None + self.sync = False self.initializers = [] self.opt_input_map = {} self.opt_attr_map = {} @@ -422,233 +430,361 @@ def parse_by_optimizer(self, ctx, context): self.initializers = initializers self.attrs = attrs - def to_string(self, indent): - accessor_str = "{}common {{{}\n{}}}" - attrs = "" - attrs += "name: \"{}\" ".format(self.accessor_class) - - if self.table_name: - attrs += "table_name: \"{}\" ".format(self.table_name) - - if self.entry: - attrs += "entry: \"{}\" ".format(self.entry) - attrs += "trainer_num: {} ".format(self.trainer_num) - attrs += "sync: {} ".format(self.sync) - if self.table_num: - attrs += "table_num: {} ".format(self.table_num) - if self.table_dim: - attrs += "table_dim: {} ".format(self.table_dim) - - for param in self.params: - attrs += "params: \"{}\" ".format(param) - - for dim in self.dims: - attrs += "dims: {} ".format(dim) - - for initializer in self.initializers: - attrs += "initializers: \"{}\" ".format(initializer) - - attrs += "\n" - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + # CommonAccessorParameter common + def _set(self, proto): + proto.name = self.accessor_class + proto.table_name = self.table_name + proto.params.extend(self.params) + proto.dims.extend(self.dims) + proto.initializers.extend(self.initializers) + proto.entry = self.entry + proto.trainer_num = self.trainer_num + proto.sync = self.sync + proto.table_num = self.table_num + proto.table_dim = self.table_dim class Tensor: - def __init__(self): - self.main_program_id = None - self.startup_program_id = None - self.feed_var_name = None - self.fetch_var_name = None - self.tensor_table_class = False - - def to_string(self, indent): - program_str = "{}tensor {{{}\n{}}}" - attrs = "" - attrs += "feed_var_name: \"{}\" ".format(str(self.feed_var_name)) - attrs += "fetch_var_name: \"{}\" ".format(str(self.fetch_var_name)) - attrs += "startup_program_id: {} ".format(str(self.startup_program_id)) - attrs += "main_program_id: {} ".format(str(self.main_program_id)) - attrs += "tensor_table_class: \"{}\" ".format( - str(self.tensor_table_class)) - attrs += "\n" - return program_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + def 
__init__(self, tesnor_dcit): + self.tensor_dict = tesnor_dcit + + def _set(self, tensor_proto): + tensor_proto.main_program_id = self.tensor_dict.get("main_program_id", + 0) + tensor_proto.startup_program_id = self.tensor_dict.get( + "startup_program_id", 0) + tensor_proto.feed_var_name = self.tensor_dict.get("feed_var_name", '') + tensor_proto.fetch_var_name = self.tensor_dict.get("fetch_var_name", '') + tensor_proto.tensor_table_class = self.tensor_dict.get( + "tensor_table_class", '') class Table: def __init__(self): - self.id = -1 self.table_class = None self.shard_num = -1 self.type = None - self.accessor = None - self.common = None + self.accessor = Accessor() + self.shard_num = 256 + self.common = CommonAccessor() self.tensor = None - self.accessor_proto = None - - def to_string(self, indent): - # if self.id == 1: - # proto_txt = '' - # with open('./sparse_table.prototxt') as f: - # proto_txt = f.read() - # return proto_txt - table_str = "{}downpour_table_param {{{}\n{}}}" - - attrs = "" - attrs += "table_id: {} ".format(self.id) - attrs += "table_class: \"{}\" ".format(self.table_class) - attrs += "shard_num: {} ".format(self.shard_num) - attrs += "type: {}".format(self.type) - attrs += "\n" - indent += 2 - - if self.accessor_proto is not None: - accessor_str = "{}accessor {{{}\n{}}}" - accessor_str = accessor_str.format( - conv_indent(indent), self.accessor_proto, conv_indent(indent)) - attrs += accessor_str + "\n" - elif self.accessor is not None: - attrs += self.accessor.to_string(indent) - attrs += "\n" - - if self.tensor is not None: - attrs += self.tensor.to_string(indent) - attrs += "\n" - - if self.common is not None: - attrs += self.common.to_string(indent) - attrs += "\n" - - return table_str.format(conv_indent(indent), attrs, conv_indent(indent)) + def _set(self, table_proto): + pass -class Service: - def __init__(self): - self.server_class = "BrpcPsServer" - self.client_class = "BrpcPsClient" - self.service_class = "BrpcPsService" - self.start_server_port = 0 - self.server_thread_num = 12 - def to_string(self, indent): - service_str = "{}service_param {{{}\n{}}}" +class BarrierTable(Table): + def __init__(self, context, idx): + super(BarrierTable, self).__init__() + self.type = None + self.shard_num = 256 + self.accessor.accessor_class = 'CommMergeAccessor' + self.common.attrs = "" + self.common.dims = [] + self.common.params = [] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.role_maker = context['role_maker'] + self.idx = idx + self.is_sync = context['is_sync'] + + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.table_class = 'BarrierTable' + table_proto.shard_num = 256 + table_proto.type = ps_pb2.PS_OTHER_TABLE + + table_proto.accessor.accessor_class = "CommMergeAccessor" + table_proto.accessor.fea_dim = 0 + table_proto.accessor.embedx_dim = 0 + + table_proto.common.name = "" + table_proto.common.table_name = "barrier_table" + table_proto.common.sync = self.is_sync + table_proto.common.entry = 'none' + + trainer_num = get_trainers(self.role_maker) + if self.is_heter_ps_mode: + trainer_num += len(self.role_maker._get_heter_worker_endpoints()) + table_proto.common.trainer_num = trainer_num - attrs = "" - attrs += "server_class: \"{}\" ".format(self.server_class) - attrs += "client_class: \"{}\" ".format(self.client_class) - attrs += "service_class: \"{}\" ".format(self.service_class) - attrs += "start_server_port: {} ".format(self.start_server_port) - attrs += "server_thread_num: {} ".format(self.server_thread_num) - return 
service_str.format( - conv_indent(indent), attrs, conv_indent(indent)) +class TensorTable(Table): + def __init__(self, idx, tensor_dict, role_maker): + super(TensorTable, self).__init__() + self.idx = idx + self.tensor_dict = tensor_dict + self.role_maker = role_maker + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.type = ps_pb2.PS_OTHER_TABLE + table_proto.table_class = self.tensor_dict.get("tensor_table_class", '') -class DownpourServer: - def __init__(self): - self.service = None - self.tables = [] + table_proto.accessor.accessor_class = "CommMergeAccessor" - def set_service_param(self, service): - self.service = service + table_proto.common.table_name = self.tensor_dict.get("feed_var_name", + '') + table_proto.common.trainer_num = get_trainers(self.role_maker) - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + tensor = Tensor(self.tensor_dict) + tensor._set(table_proto.tensor) - def to_string(self, indent): - server_str = "{}downpour_server_param {{{}\n{}}}" - table_strs = "" - indent += 2 +class SparseTable(Table): + def __init__(self, context, send_ctx): + super(SparseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.type = None + self.table_class = 'MemorySparseTable' + self.accessor = Accessor() - table_strs += "\n" - table_strs += self.service.to_string(indent) + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + + print('new table_name: {}'.format(self.common.table_name)) + all_table_proto = self.context[ + "user_defined_strategy"].sparse_table_configs + usr_table_proto = all_table_proto.add() + for proto in all_table_proto: + if proto.table_name == self.common.table_name: + usr_table_proto = proto + break + table_proto.table_class = 'MemorySparseTable' + warnings.warn("The PS mode must use MemorySparseTable.") + if usr_table_proto.HasField("shard_num"): + table_proto.shard_num = usr_table_proto.shard_num + else: + table_proto.shard_num = 1000 + warnings.warn( + "The shard_num of sparse table is not set, use default value 1000." 
+ ) - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return server_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) + if usr_table_proto.accessor.ByteSize() == 0: + warnings.warn( + "The accessor of sparse table is not set, use default value.") + table_proto.accessor.ParseFromString( + usr_table_proto.accessor.SerializeToString()) + self.accessor._set(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) -class Server: - def __init__(self): - self.servers = [] + check_embedding_dim(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) - def add_server(self, server): - if not isinstance(server, DownpourServer): - raise ValueError("only support instance DownpourServer") - self.servers.append(server) + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False - def __str__(self): - server_str = "server_param {{{}\n}}" - indent = 2 - servers_str = "" - for server in self.servers: - servers_str += "\n" - servers_str += server.to_string(indent) + self.common._set(table_proto.common) - return server_str.format(servers_str) +class GeoSparseTable(SparseTable): + def __init__(self, context, send_ctx): + super(GeoSparseTable, self).__init__(context, send_ctx) + self.table_class = "SparseGeoTable" + if self.context['ps_mode'] != DistributedMode.GEO: + raise ValueError("not geo sparse table!") + + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = ctx.sections()[1] + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = False + self.common._set(table_proto.common) + + +class DenseTable(Table): + def __init__(self, context, send_ctx): + super(DenseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.accessor = Accessor() -class DownpourWorker: + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == True): + return + + table_proto.table_id = ctx.table_id() + + table_proto.type = ps_pb2.PS_DENSE_TABLE + table_proto.table_class = "CommonDenseTable" + table_proto.shard_num = 256 + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = 1 + + self.common.table_name = "MergedDense" + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False + + self.common._set(table_proto.common) + + +class Server: def __init__(self): - self.tables = [] + 
pass - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + def _set(self): + pass - def to_string(self, indent): - worker_str = "{}downpour_worker_param {{{}\n{}}}" - table_strs = "" - indent += 2 - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return worker_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) +class DownpourServer(Server): + def __init__(self): + super(DownpourServer, self).__init__() + + def _set(self): + pass class Worker: def __init__(self): - self.workers = [] + pass - def add_worker(self, worker): - if not isinstance(worker, DownpourWorker): - raise ValueError("only support instance DownpourWorker") - self.workers.append(worker) + def _set(self): + pass - def __str__(self): - worker_str = "worker_param {{{}\n}}" - indent = 2 - workers_str = "" - for worker in self.workers: - workers_str += "\n" - workers_str += worker.to_string(indent) - return worker_str.format(workers_str) +class DownpourWorker(Worker): + def __init__(self): + super(DownpourWorker, self).__init__() + + def _set(self): + pass class fsClient: - def __init__(self, proto): - self.proto = proto - self.uri = proto.uri - self.user = proto.user - self.passwd = proto.passwd - self.hadoop_bin = proto.hadoop_bin - - def to_string(self): - proto_txt = text_format.MessageToString(self.proto) - if proto_txt: - fs_str = "fs_client_param {{\n{}}}" - return fs_str.format(proto_txt) + def __init__(self, fs_client_param): + self.fs_client_param = fs_client_param + + def _set(self, proto): + if not text_format.MessageToString(self.fs_client_param): + return + proto.uri = self.fs_client_param.uri + proto.user = self.fs_client_param.user + proto.passwd = self.fs_client_param.passwd + proto.hadoop_bin = self.fs_client_param.hadoop_bin + + +class PsDescBuilder(object): + def __init__(self, context): + self.context = context + self.is_sync = context['is_sync'] + self.ps_mode = context['ps_mode'] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.use_ps_gpu = context['use_ps_gpu'] + self.send_ctx = get_the_one_send_context( + self.context, + use_origin_program=True, + split_dense_table=self.is_heter_ps_mode) + + self.tensor_table_dict = {} # TODO + self._server_sub_program = [] + + self.tables = self._get_tables() + + self.service = self._get_service() + self.fs_client = self._get_fs_client() + + self.ps_desc = ps_pb2.PSParameter() + + def _get_tensor_tables(self): + program_idx = 0 + if not self.tensor_table_dict: + self._server_sub_program.append(Program().desc) + tables = [] + for table_name in self.tensor_table_dict: + tables.append(globals()['TensorTable'](len(tables), tensor_dict, + self.context['role_maker'])) + program_idx += 1 + return tables + + def _get_tables(self): + tables = [] + for idx, (name, ctx) in enumerate(self.send_ctx.items()): + print('####### {}\n'.format(ctx.is_sparse())) + if ctx.is_sparse(): + if self.ps_mode == DistributedMode.GEO: + tables.append(globals()['GeoSparseTable'](self.context, + ctx)) + else: + tables.append(globals()['SparseTable'](self.context, ctx)) + else: + tables.append(globals()['DenseTable'](self.context, ctx)) + self.tensor_tables = self._get_tensor_tables() + tables.extend(self.tensor_tables) + tables.append(globals()['BarrierTable'](self.context, len(tables))) + return tables + + def _get_service(self): + if self.use_ps_gpu: + return GpuService() else: - return "" + return Service() + + def _get_fs_client(self): 
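    # A minimal usage sketch of PsDescBuilder (hedged; `context` is assumed to be
    # the attrs dict that TheOnePSRuntime._set_basic_info passes in further below):
    #
    #   builder = PsDescBuilder(context)
    #   worker_desc = builder.build_worker_desc()  # text-format PSParameter for trainers
    #   server_desc = builder.build_server_desc()  # text-format PSParameter for servers
    #
    # Both build_*_desc methods below add one downpour_table_param per Table in
    # self.tables, fill it through that table's _set(), and return
    # text_format.MessageToString(self.ps_desc); the resulting strings are handed
    # to the communicator in _init_worker and to DistFleetWrapper.init_server in
    # _init_server.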
+ return fsClient(self.context["user_defined_strategy"].fs_client_param) + + def build_worker_desc(self): + for table in self.tables: + table_proto = self.ps_desc.worker_param.downpour_worker_param.downpour_table_param.add( + ) + table._set(table_proto) + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + return text_format.MessageToString(self.ps_desc) + + def build_server_desc(self): + for table in self.tables: + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.sparse_table_maps = {} + if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None: + self.sparse_table_maps[ + table_proto.common.table_name] = table_proto.table_id + + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + self.fs_client._set(self.ps_desc.fs_client_param) + return text_format.MessageToString(self.ps_desc) class TheOnePSRuntime(RuntimeBase): @@ -665,8 +801,11 @@ def _set_basic_info(self, context): self.role_maker = context["role_maker"] self.origin_main_program = context["origin_main_program"] - self.origin_main_programs = context["origin_main_programs"] - + self.origin_main_programs = context.get("origin_main_programs", + [self.origin_main_program]) + self.context["origin_main_programs"] = self.origin_main_programs + self.context["origin_startup_programs"] = context.get( + 'origin_startup_programs', [context['origin_startup_program']]) self.context[ 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode self.is_heter_ps_mode = self.context['is_heter_ps_mode'] @@ -675,15 +814,23 @@ def _set_basic_info(self, context): self.context['ps_mode'] = self.context['trainer'].mode self.context['use_ps_gpu'] = context['valid_strategy'].a_sync_configs[ 'use_ps_gpu'] - self.is_sync = True if self.context[ + self.context['is_sync'] = True if self.context[ 'ps_mode'] == DistributedMode.SYNC else False self.context['grad_name_to_param_name'] = {} self.context['tensor_table'] = {} build_var_distributed(self.context) + endpoints = get_ps_endpoints(self.role_maker) + self.string_hosts = [] + for idx, ep in enumerate(endpoints): + host, port = ep.split(":") + pshost = fluid.core.PSHost(host, int(port), idx) + self.string_hosts.append(pshost.serialize_to_string()) + + self.ps_desc_builder = PsDescBuilder(self.context) + def _init_worker(self): - worker = self._get_fleet_proto(is_server=False, is_sync=self.is_sync) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) + worker_desc = self.ps_desc_builder.build_worker_desc() if self.context['use_ps_gpu']: main_program = self.context['loss'].block.program @@ -701,23 +848,11 @@ def sync_strategy_envs(): kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs - proto_txt = str(worker) + "\n" + str(server) - with open('proto_txt', 'w') as f: - f.write(proto_txt) - + proto_txt = worker_desc + "\n" + server_desc debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: print("worker: \n{}".format(proto_txt)) - endpoints = get_ps_endpoints(self.role_maker) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) - dense_map = get_the_one_recv_context( self.context, split_dense_table=self.is_heter_ps_mode) send_ctx = 
get_the_one_send_context( @@ -741,7 +876,7 @@ def sync_strategy_envs(): kwargs["trainer_id"] = self.role_maker._role_id() kwargs["trainers"] = self.role_maker._worker_num() - for table in server.servers[0].tables: + for table in server.servers[0].tables: #TODO if table.table_class == "BarrierTable": kwargs["barrier_table_id"] = table.id break @@ -755,7 +890,8 @@ def sync_strategy_envs(): trainer_config.mode, kwargs, trainer_config.get_communicator_flags()) self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt, - string_hosts, fluid.global_scope()) + self.string_hosts, + fluid.global_scope()) fleet.util.barrier() info = self._communicator.get_client_info() @@ -812,275 +948,16 @@ def sync_strategy_envs(): previous_trainers, self.role_maker._role_id()) - def _push_sparse_param(self, - var_name, - table_id=-1, - scope=fluid.global_scope()): - self._communicator.push_sparse_param(var_name, table_id, scope) - - def _get_executor(self): - executor = fluid.Executor(fluid.CPUPlace()) - if self.is_heter_ps_mode: - if self.role_maker._is_heter_worker(): - heter_device_type = self.role_maker._heter_device_type().upper() - if heter_device_type not in ["GPU", "XPU", "CPU"]: - raise ValueError("Heter Worker Not Support Device {}". - format(device_type)) - if heter_device_type == "GPU": - executor = Executor( - fluid.CUDAPlace( - int(os.getenv("FLAGS_selected_gpus", "0")))) - elif heter_device_type == "XPU": - executor = Executor( - fluid.XPUPlace( - int(os.getenv("FLAGS_selected_xpus", "0")))) - return executor - - def _get_fleet_proto(self, is_server, is_sync, **kwargs): - def _build_merge_accessor(ctx): - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - - if ctx.is_sparse(): - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = ctx.sections()[1] - else: - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = 1 - - return accessor - - def _build_barrier_table(idx): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = "BarrierTable" - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = "barrier_table" - trainer_num = get_trainers(self.context['role_maker']) - if self.is_heter_ps_mode: - trainer_num += len(self.role_maker._get_heter_worker_endpoints( - )) - common.trainer_num = trainer_num - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - return table - - def _build_tensor_table(idx, tensor_dict): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = tensor_dict["tensor_table_class"] - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = tensor_dict["feed_var_name"] - common.trainer_num = get_trainers(self.role_maker) - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - - tensor = Tensor() - tensor.main_program_id = tensor_dict["main_program_id"] - tensor.startup_program_id = tensor_dict["startup_program_id"] - tensor.feed_var_name = tensor_dict["feed_var_name"] - tensor.fetch_var_name = tensor_dict["fetch_var_name"] - tensor.tensor_table_class = 
tensor_dict["tensor_table_class"] - table.tensor = tensor - - return table - - def _add_tensor_table(tables): - tensor_table_dict = {} - program_idx = 0 - for table_name in tensor_table_dict: - if tensor_table_dict[table_name]["startup_program"] != None: - tensor_table_dict[table_name][ - "startup_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["startup_program"].desc) - program_idx += 1 - if tensor_table_dict[table_name]["main_program"] != None: - tensor_table_dict[table_name][ - "main_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["main_program"].desc) - program_idx += 1 - # Todo: Hard code for lr_decay table apply table id - new_table = _build_tensor_table( - len(tables), tensor_table_dict[table_name]) - tables.append(new_table) - return tables - - def _get_tables(): - send_ctx = get_the_one_send_context( - self.context, - use_origin_program=True, - split_dense_table=self.is_heter_ps_mode) - - tables = [] - for idx, (name, ctx) in enumerate(send_ctx.items()): - print(" wxm python test send_ctx.items-->", idx, (name, ctx)) - if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1: - continue - - table = Table() - table.id = ctx.table_id() - common = CommonAccessor() - - if ctx.is_sparse(): - table.type = "PS_SPARSE_TABLE" - table.shard_num = 256 - - common.table_name = self.context['grad_name_to_param_name'][ - ctx.origin_varnames()[0]] - - if self.context['ps_mode'] == DistributedMode.GEO: - table.table_class = "SparseGeoTable" - else: - all_table_proto = self.context[ - "user_defined_strategy"].sparse_table_configs - table_proto = all_table_proto.add() - for proto in all_table_proto: - if proto.table_name == common.table_name: - table_proto = proto - break - if table_proto.HasField("table_class"): - table.table_class = table_proto.table_class - else: - table.table_class = parse_table_class( - common.table_name, - ctx.program_id(), self.context) - if table.table_class != 'MemorySparseTable': - table.table_class = 'MemorySparseTable' - warnings.warn( - "The PS mode must use MemorySparseTable.") - - if table_proto.HasField("shard_num"): - table.shard_num = table_proto.shard_num - else: - table.shard_num = 1000 - warnings.warn( - "The shard_num of sparse table is not set, use default value 1000." - ) - - if table_proto.accessor.ByteSize() == 0: - warnings.warn( - "The accessor of sparse table is not set, use default value." 
- ) - get_default_accessor_proto( - table_proto.accessor, common.table_name, - ctx.program_id(), self.context) - check_embedding_dim(table_proto.accessor, - common.table_name, - ctx.program_id(), self.context) - table.accessor_proto = text_format.MessageToString( - table_proto.accessor) - else: - table.type = "PS_DENSE_TABLE" - table.table_class = "CommonDenseTable" - table.shard_num = 256 - common.table_name = "MergedDense" - - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum - common.parse_by_optimizer(ctx, self.context) - - if ctx.is_sparse(): - common.parse_entry(common.table_name, - ctx.program_id(), self.context) - - if is_sync: - common.sync = "true" - else: - common.sync = "false" - table.common = common - - if table.table_class != 'MemorySparseTable': - accessor = _build_merge_accessor(ctx) - table.accessor = accessor - tables.append(table) - - tensor_table_dict = {} - if len(tensor_table_dict) > 0: - tables = _add_tensor_table(tables) - else: - empty_porgram = Program() - self._server_sub_program.append(empty_porgram.desc) - - barrier_table = _build_barrier_table(len(tables)) - tables.append(barrier_table) - return tables - - if is_server: - server = Server() - downpour_server = DownpourServer() - - service = Service() - dist_strategy = self.context["valid_strategy"] - use_ps_gpu = dist_strategy.a_sync_configs["use_ps_gpu"] - if use_ps_gpu: - service.server_class = "PsLocalServer" - service.client_class = "PsLocalClient" - downpour_server.set_service_param(service) - - tables = _get_tables() - downpour_server.tables = tables - server.add_server(downpour_server) - return server - else: - worker = Worker() - downpour_worker = DownpourWorker() - - tables = _get_tables() - downpour_worker.tables = tables - worker.add_worker(downpour_worker) - return worker - def _init_server(self, dirname=None, var_names=None, **kwargs): + server_desc = self.ps_desc_builder.build_server_desc() role_id = get_role_id(self.role_maker) - endpoints = get_ps_endpoints(self.role_maker) trainers = get_trainers(self.role_maker) if self.is_heter_ps_mode: trainers += len(self.role_maker._get_heter_worker_endpoints()) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) - proto_txt = str(server) - fs_client = fsClient(self.context["user_defined_strategy"] - .fs_client_param) - proto_txt = proto_txt + "\n" + fs_client.to_string() - - debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: - print("server: \n{}".format(proto_txt)) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) self._server = fluid.core.DistFleetWrapper() - self._server.init_server(proto_txt, string_hosts, role_id, trainers, - self._server_sub_program) + self._server.init_server(server_desc, self.string_hosts, role_id, + trainers, self._server_sub_program) dist_varnames = get_sparse_tablenames(self.origin_main_programs, True) sparse_varnames = get_sparse_tablenames(self.origin_main_programs, @@ -1101,10 +978,7 @@ def _init_server(self, dirname=None, var_names=None, **kwargs): if dirname is None or not load_varnames: return - sparse_table_maps = {} - for table in server.servers[0].tables: - if table.type == "PS_SPARSE_TABLE" and table.common is not None: - sparse_table_maps[table.common.table_name] = table.id + sparse_table_maps = self.ps_desc_builder.sparse_table_maps dirname = os.path.normpath(dirname) pserver_id = self.role_maker._role_id() @@ -1186,7 +1060,7 @@ def 
_save_distributed_persistables(self, sparses = get_the_one_recv_context( self.context, is_dense=False, - split_dense_table=self.is_heter_ps_mod, + split_dense_table=self.is_heter_ps_mode, use_origin_program=True) sparse_varnames = self._save_sparse_params(executor, dirname, sparses, @@ -1413,7 +1287,7 @@ def _shrink(self, threshold=None): fleet.util.barrier() if self.role_maker._is_first_worker(): - sparses = sget_the_one_recv_context( + sparses = get_the_one_recv_context( self.context, is_dense=False, split_dense_table=self.role_maker. diff --git a/python/paddle/distributed/ps/utils/ps_factory.py b/python/paddle/distributed/ps/utils/ps_factory.py index 1a426f3ad6c6a..701ae8be6cb9c 100755 --- a/python/paddle/distributed/ps/utils/ps_factory.py +++ b/python/paddle/distributed/ps/utils/ps_factory.py @@ -38,5 +38,7 @@ def _create_ps_program_builder(self, pass_ctx): elif 'is_fl_ps_mode' in attrs and attrs[ 'is_fl_ps_mode'] == DistributedMode.FL: return globals()['FlPsProgramBuilder'](pass_ctx) - else: + elif attrs['ps_mode'] == DistributedMode.SYNC: return globals()['CpuSyncPsProgramBuilder'](pass_ctx) + else: + return globals()['CpuAsyncPsProgramBuilder'](pass_ctx) diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index 25e4dc28bdcb8..d737542f32344 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -95,11 +95,12 @@ def _build_trainer_programs(self): class CpuSyncPsProgramBuilder(PsProgramBuilder): def __init__(self, pass_ctx): - logger.info("start building cpu-sync-ps program") super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) + if self.ps_mode == DistributedMode.SYNC: + logger.info("start building cpu-sync-ps program") if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC: raise ValueError("ps mode: {} not matched {}", - format(self.ps_mode, "CpuSyncPsProgramBuilder")) + format(self.ps_mode, "PsProgramBuilder")) def _build_trainer_programs(self): add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index ebec6900e38f5..ab5bd7da09dfc 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -73,7 +73,9 @@ def logger_config(log_path, logging_name): return logger -logger = logger_config(log_path='/ps_log', logging_name='ps_log') +ps_log_root_dir = '/ps_log/' +logger = logger_config( + log_path='/ps_usr_print_log', logging_name='ps_usr_print_log') class DistributedMode: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt old mode 100644 new mode 100755 index 2f6df075478e6..1443eebf29384 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -627,7 +627,7 @@ set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) add_subdirectory(distributed_passes) - + add_subdirectory(ps) add_subdirectory(auto_parallel) # FIXME(typhoonzero): add these tests back diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py index 63dd4b8e21e07..93a0044a5e43c 100755 --- 
a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py @@ -23,13 +23,24 @@ import numpy as np from collections import OrderedDict from paddle.distributed.ps.utils.public import logger -from dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists +from paddle.fluid.tests.unittests.distributed_passes.dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists import paddle.distributed.fleet as fleet class PsPassTestBase(unittest.TestCase): def init(self): - raise NotImplementedError + self.config = {} + self.config['ps_mode_config'] = "" + self.config['worker_num'] = "1" + self.config['server_num'] = "1" + self.config['run_minimize'] = "0" + self.config['run_single_pass'] = "0" + self.config['run_the_one_ps'] = '0' + self.config['debug_new_minimize'] = "0" + self.config['debug_new_pass'] = "0" + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = "" + self.config['applied_pass_name'] = "" def setUp(self): print('Ps setUp...') @@ -37,7 +48,7 @@ def setUp(self): def tearDown(self): print('Ps tearDown...') - def ps_launch(self, config, ps_mode="cpu-ps"): + def ps_launch(self, ps_mode="cpu-ps"): if ps_mode == "cpu-ps" or ps_mode == 'heter-ps': os.environ['WITH_DISTRIBUTE'] = 'ON' @@ -45,23 +56,26 @@ def ps_launch(self, config, ps_mode="cpu-ps"): sys.executable, "-u", ] + [ - "-m", "launch", "--log_dir", config['log_dir'], "--worker_num", - config['worker_num'], "--server_num", config['server_num'] + "-m", "launch", "--log_dir", self.config['log_dir'], + "--worker_num", self.config['worker_num'], "--server_num", + self.config['server_num'] ] if ps_mode == 'heter-ps': os.environ['FLAGS_START_PORT'] = '12004' cmd += [ - '--heter_worker_num', config['heter_worker_num'], - '--heter_devices', config['heter_devices'] + '--heter_worker_num', self.config['heter_worker_num'], + '--heter_devices', self.config['heter_devices'] ] cmd += [ - "../ps/ps_dnn_trainer.py", "-m", config['ps_mode_config'], - "--run_minimize", config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + "../ps/ps_dnn_trainer.py", "-m", self.config['ps_mode_config'], + "--run_minimize", self.config['run_minimize'], + "--run_single_pass", self.config['run_single_pass'], + "--run_the_one_ps", self.config['run_the_one_ps'], + "--debug_new_pass", self.config['debug_new_pass'], + "--debug_new_minimize", self.config['debug_new_minimize'], + "--applied_pass_name", self.config['applied_pass_name'], + "--debug_the_one_ps", self.config['debug_the_one_ps'] ] elif ps_mode == "gpu-ps": os.environ['FLAGS_LAUNCH_BARRIER'] = '0' @@ -80,12 +94,14 @@ def ps_launch(self, config, ps_mode="cpu-ps"): cmd = [ sys.executable, "-u", "../ps/ps_dnn_trainer.py", "-m", - config['ps_mode_config'], "--run_minimize", - config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + self.config['ps_mode_config'], "--run_minimize", + self.config['run_minimize'], "--run_single_pass", + self.config['run_single_pass'], "--run_the_one_ps", + self.config['run_the_one_ps'], "--debug_new_pass", + self.config['debug_new_pass'], "--debug_new_minimize", + 
self.config['debug_new_minimize'], "--applied_pass_name", + self.config['applied_pass_name'], "--debug_the_one_ps", + self.config['debug_the_one_ps'] ] cmd = [shlex.quote(c) for c in cmd] diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py index b186869ee9747..fd558ef040329 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py @@ -21,31 +21,26 @@ import paddle from ps_pass_test_base import * -from paddle.distributed.ps.utils.public import logger +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer class TestPsTrainerPass(PsPassTestBase): - def init(self): - self.config = {} - self.config['ps_mode_config'] = "" - self.config['worker_num'] = "1" - self.config['server_num'] = "1" - self.config['run_minimize'] = "0" - self.config['run_single_pass'] = "0" - self.config['debug_new_minimize'] = "0" - self.config['debug_new_pass'] = "0" - self.config['log_dir'] = "" - self.config['applied_pass_name'] = "" - def setUp(self): pass def tearDown(self): pass - def check(self): - pass + def check(self, file1, file2): + with open(file1, 'r', encoding='utf-8') as f: + text1 = f.read() + with open(file2, 'r', encoding='utf-8') as f: + text2 = f.read() + if text1 == text2: + return True + else: + return False def test_ps_optimizer_minimize_cpu_async(self): self.init() @@ -53,16 +48,21 @@ def test_ps_optimizer_minimize_cpu_async(self): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/async_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/async_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_async passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_async failed!') def test_ps_optimizer_minimize_cpu_sync(self): self.init() @@ -70,16 +70,22 @@ def test_ps_optimizer_minimize_cpu_sync(self): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/sync_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/sync_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) - - self.check() + self.ps_launch() + ''' + file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_sync passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_sync 
failed!') + ''' def test_ps_optimizer_minimize_cpu_geo(self): self.init() @@ -87,16 +93,21 @@ def test_ps_optimizer_minimize_cpu_geo(self): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/geo_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/geo_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_geo passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_geo failed!') # heter ps 二阶段 def test_ps_optimizer_minimize_heter(self): @@ -110,14 +121,24 @@ def test_ps_optimizer_minimize_heter(self): self.config['ps_mode_config'] = "../ps/heter_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/heter_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/heter_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') + ''' + file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt' + file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt' + file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt' + if self.check(file1, file2) and self.check(file3, file4): + logger.info('test_ps_optimizer_minimize_heter passed!') + else: + logger.error('test_ps_optimizer_minimize_heter failed!') + ''' def test_ps_optimizer_minimize_gpu(self): self.init() @@ -125,29 +146,42 @@ def test_ps_optimizer_minimize_gpu(self): self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") self.config['debug_new_minimize'] = '1' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") - self.check() + file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_gpu passed!') + else: + logger.error('test_ps_optimizer_minimize_gpu failed!') def test_append_send_ops_pass(self): self.init() self.config['run_single_pass'] = '1' + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" self.config['applied_pass_name'] = "append_send_ops_pass" self.config['debug_new_pass'] = '0' - self.config['log_dir'] = "/log_old_" + self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_old_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") + self.ps_launch("cpu-ps") self.config['debug_new_pass'] = '1' - self.config['log_dir'] = "/log_new_" + 
self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_new_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") - - self.check() + self.ps_launch("cpu-ps") + + file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_append_send_ops_pass passed!') + else: + logger.info('test_append_send_ops_pass failed!') def test_distributed_ops_pass(self): pass diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt old mode 100644 new mode 100755 index 3aef3283b8200..9af32a8aca741 --- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt @@ -3,6 +3,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + list(APPEND TEST_OPS ${TEST_OP}) + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50) endforeach(TEST_OP) - -set_tests_properties(test_the_one_ps PROPERTIES TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index d08c1d41c89ec..bc87fc255a59b 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -264,12 +264,16 @@ def parse_args(): '--run_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--run_single_pass', type=int, default=0, help="test single pass") + parser.add_argument( + '--run_the_one_ps', type=int, default=0, help="test the_one_ps") parser.add_argument( '--debug_new_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--debug_new_pass', type=int, default=0, help="test single pass") parser.add_argument( '--applied_pass_name', type=str, default="", help="test single pass") + parser.add_argument( + '--debug_the_one_ps', type=int, default=0, help="test the_one_ps") args = parser.parse_args() args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml)) @@ -280,9 +284,11 @@ def parse_args(): config["pure_bf16"] = args.pure_bf16 config['run_minimize'] = args.run_minimize config['run_single_pass'] = args.run_single_pass + config['run_the_one_ps'] = args.run_the_one_ps config['debug_new_minimize'] = args.debug_new_minimize config['debug_new_pass'] = args.debug_new_pass config['applied_pass_name'] = args.applied_pass_name + config['debug_the_one_ps'] = args.debug_the_one_ps yaml_helper.print_yaml(config) return config @@ -344,15 +350,15 @@ def run_minimize(self): fleet_obj.minimize(loss) if fleet.is_server(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_server_main.prototxt' debug_program(_main_file, loss.block.program) elif fleet.is_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_worker_main.prototxt' debug_program(_main_file, loss.block.program) elif self.role_maker._is_heter_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( 
self.config[ 'debug_new_minimize']) + '_heter_worker_main.prototxt' debug_program(_main_file, loss.block.program) @@ -397,16 +403,84 @@ def run_single_pass(self): _main = worker.append_send_ops_pass(_main, compiled_config) if fleet.is_server(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_server_main.prototxt' debug_program(_main_file, _main) elif fleet.is_worker(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_worker_main.prototxt' debug_program(_main_file, _main) + def run_the_one_ps(self): + self.init_fleet_with_gloo() + self.model = get_model(self.config) + self.input_data = self.model.create_feeds() + self.metrics = self.model.net(self.input_data) + loss = self.model._cost + user_defined_strategy = get_user_defined_strategy(self.config) + learning_rate = self.config.get( + "hyper_parameters.optimizer.learning_rate") + sync_mode = self.config.get("runner.sync_mode") + inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) + + self.role_maker._generate_role() # 必要 + if self.config['debug_the_one_ps'] == 1: + logger.info("entering run_the_one_ps -- new") + + from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer + ps_optimizer = ParameterServerOptimizer(inner_optimizer) + ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, + user_defined_strategy) + ps_optimizer.minimize_impl(loss) + + from paddle.distributed.ps.the_one_ps import TheOnePSRuntime + _runtime_handle = TheOnePSRuntime() # ps 目录下重构版的 TheOnePSRuntime + _runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs) + if fleet.is_worker(): + worker_desc = _runtime_handle.ps_desc_builder.build_worker_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_worker_ps_desc', 'w') as f: + f.write(worker_desc) + if fleet.is_server(): + server_desc = _runtime_handle.ps_desc_builder.build_server_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_server_ps_desc', 'w') as f: + f.write(server_desc) + + else: + pass + ''' + logger.info("entering run_the_one_ps -- old") + fleet_obj = fleet.distributed_optimizer( + inner_optimizer, user_defined_strategy) + fleet_obj.minimize(loss) + if fleet.is_worker(): + worker_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=False, is_sync=False) + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'worker_ps_desc', 'w') as f: + f.write(str(worker_desc) + str(server_desc)) + if fleet.is_server(): + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'server_ps_desc', 'w') as f: + f.write(str(server_desc) + str(fleet_obj._runtime_handle._get_fs_client_desc().to_string())) + ''' + if fleet.is_server(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_server_main.prototxt' + debug_program(_main_file, loss.block.program) + elif fleet.is_worker(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + elif self.role_maker._is_heter_worker(): + 
_main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_heter_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + if __name__ == "__main__": paddle.enable_static() @@ -418,3 +492,5 @@ def run_single_pass(self): benchmark_main.run_single_pass() elif config['run_minimize'] == 1: benchmark_main.run_minimize() + elif config['run_the_one_ps'] == 1: + benchmark_main.run_the_one_ps() diff --git a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py old mode 100644 new mode 100755 index 78bae0e50c580..8dddc6abd4ced --- a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py +++ b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py @@ -22,16 +22,100 @@ import paddle import paddle.fluid as fluid +import paddle +from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import * +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir +from ps_dnn_trainer import DnnTrainer +from paddle.distributed.fleet.proto import ps_pb2 +from google.protobuf import text_format + -class TestTheOnePs(unittest.TestCase): +class TestTheOnePs(PsPassTestBase): def setUp(self): - print('setUp...') + pass def tearDown(self): - print('tearDown...') + pass - def test_main(self): + def check(self, file1, file2): pass + ''' + f = open(file1, "rb") + ps_desc_1 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_1) + f.close() + + f = open(file2, "rb") + ps_desc_2 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_2) + f.close() + str1 = text_format.MessageToString(ps_desc_1) + str2 = text_format.MessageToString(ps_desc_2) + #logger.info('### msg10: {}'.format(str1)) + #logger.info('### msg20: {}'.format(str2)) + if str1 == str2: + return True + else: + return False + ''' + + def test_ps_cpu_async(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/async_worker_ps_desc' + desc2 = '/ps_log/async_new_worker_ps_desc' + desc3 = '/ps_desc_baseline/async_server_ps_desc' + desc4 = '/ps_log/async_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_async ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_async ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_async ps_desc: server passed!') + else: + logger.info('test_ps_cpu_async ps_desc: server failed!') + + def test_ps_cpu_geo(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_geo_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/geo_worker_ps_desc' + desc2 = '/ps_log/geo_new_worker_ps_desc' + desc3 = 
'/ps_desc_baseline/geo_server_ps_desc' + desc4 = '/ps_log/geo_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_geo ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_geo ps_desc: server passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: server failed!') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ps_dnn_model.py b/python/paddle/fluid/tests/unittests/ps_dnn_model.py index 0a147334dab26..8d91e0f4678cb 100755 --- a/python/paddle/fluid/tests/unittests/ps_dnn_model.py +++ b/python/paddle/fluid/tests/unittests/ps_dnn_model.py @@ -74,6 +74,7 @@ def forward(self, sparse_inputs, dense_inputs): else: emb = self.embedding(s_input) emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim]) + # emb.stop_gradient = True sparse_embs.append(emb) y_dnn = paddle.concat(x=sparse_embs + [dense_inputs], axis=1) From 28795771408a6dcd757ed367d348fb0ead5ab507 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 2 Mar 2022 16:40:05 +0800 Subject: [PATCH 16/41] run recompute's real backward with amp disabled (#40042) --- python/paddle/distributed/fleet/utils/recompute.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index dccd7f6205302..4ccb48ef72e71 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -182,9 +182,10 @@ def backward(ctx, *args): "none of output has requires_grad=True, this recompute() is not necessary" ) - # actually backward - paddle.autograd.backward(forward_outputs_with_grad, - backward_inputs_with_grad) + # actually backward + with paddle.amp.auto_cast(enable=False): + paddle.autograd.backward(forward_outputs_with_grad, + backward_inputs_with_grad) grads = list(inp._grad_ivar() for inp in detached_inputs if isinstance(inp, core.VarBase)) From 8492d3bbf6f01e98d6674b57b27913fe537584dd Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 2 Mar 2022 16:43:52 +0800 Subject: [PATCH 17/41] The backward code of Sparse Conv3d (#40054) Sparse Conv3d backward code --- .../kernels/sparse/convolution_grad_kernel.h | 66 +++++++ paddle/phi/kernels/sparse/cpu/convolution.h | 1 + .../sparse/cpu/convolution_grad_kernel.cc | 166 ++++++++++++++++++ .../kernels/test_sparse_conv3d_dev_api.cc | 112 +++++++++++- 4 files changed, 337 insertions(+), 8 deletions(-) create mode 100644 paddle/phi/kernels/sparse/convolution_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h new file mode 100644 index 0000000000000..1a6ac852448a5 --- /dev/null +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad); + +template +std::vector Conv3dGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups) { + DenseTensor x_grad = phi::Empty(dev_ctx); + DenseTensor kernel_grad = phi::Empty(dev_ctx); + Conv3dGradKernel(dev_ctx, + x, + rulebook, + kernel, + out_grad, + paddings, + dilations, + strides, + groups, + &x_grad, + &kernel_grad); + std::vector out(2); + out[0] = x_grad; + out[1] = kernel_grad; + return out; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 5803069d927d7..ab2fef5320f71 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc new file mode 100644 index 0000000000000..d4f770ce8713a --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
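The CPU kernel defined next in this patch follows the gather-GEMM-scatter pattern described in its comments: each rulebook column pairs one input non-zero with one output non-zero under a given kernel offset, the pairs are grouped by kernel offset, d_kernel = transpose(x) * out_grad and d_x = out_grad * transpose(kernel) are computed per group, and the d_x rows are scattered back by input index. The following pure-Python sketch is only illustrative; the helper names and the toy data are not taken from the patch.

def matmul(A, B):
    return [[sum(a * b for a, b in zip(row, col)) for col in zip(*B)] for row in A]

def transpose(A):
    return [list(col) for col in zip(*A)]

def conv3d_grad(x_values, dout_values, kernel, rulebook):
    # rulebook rows: [kernel_index, in_i, out_i], one column per (input, output) pair
    k_idx, in_i, out_i = rulebook
    x_grad = [[0.0] * len(x_values[0]) for _ in x_values]
    kernel_grad = [[[0.0] * len(kernel[0][0]) for _ in kernel[0]] for _ in kernel]
    for k in range(len(kernel)):
        rows = [r for r, ki in enumerate(k_idx) if ki == k]
        if not rows:
            continue
        X = [x_values[in_i[r]] for r in rows]         # gathered inputs,    (M, in_channels)
        dOut = [dout_values[out_i[r]] for r in rows]  # gathered out grads, (M, out_channels)
        kernel_grad[k] = matmul(transpose(X), dOut)   # d_kernel = transpose(x) * out_grad
        dX = matmul(dOut, transpose(kernel[k]))       # d_x = out_grad * transpose(kernel)
        for r, row in zip(rows, dX):                  # scatter-add back into x_grad
            for c, v in enumerate(row):
                x_grad[in_i[r]][c] += v
    return x_grad, kernel_grad

# toy check: 2 non-zeros, 2 kernel offsets, 1 channel
print(conv3d_grad([[1.0], [2.0]], [[0.5], [0.25]],
                  [[[3.0]], [[4.0]]], [[0, 1], [0, 1], [0, 1]]))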
*/ + +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +// rulebook: +//[ +// [kernel_index], +// [in_i], +// [out_i], +//] +// x_grad = out_grad * transpose(kenrel) +// kernel_grad = transpose(x) * out_grad +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad) { + const auto& kernel_dims = kernel.dims(); + const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + const int* rulebook_ptr = rulebook.data(); + + const int rulebook_len = rulebook.dims()[1]; + + DenseTensorMeta in_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta d_x_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta out_grad_features_meta( + x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor d_x_features = + phi::Empty(dev_ctx, std::move(d_x_features_meta)); + phi::DenseTensor out_grad_features = + phi::Empty(dev_ctx, std::move(out_grad_features_meta)); + + dev_ctx.Alloc( + &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); + T* in_features_ptr = in_features.data(); + dev_ctx.Alloc( + &d_x_features, d_x_features.dtype(), sizeof(T) * d_x_features.numel()); + T* d_x_features_ptr = d_x_features.data(); + dev_ctx.Alloc(&out_grad_features, + out_grad_features.dtype(), + sizeof(T) * out_grad_features.numel()); + T* out_grad_features_ptr = out_grad_features.data(); + kernel_grad->Resize(kernel_dims); + dev_ctx.Alloc( + kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); + T* d_kernel_ptr = kernel_grad->data(); + + Gather(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + in_channels, + in_features_ptr); + Gather(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + rulebook_len, + out_channels, + out_grad_features_ptr); + + auto blas = phi::funcs::GetBlas(dev_ctx); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + int offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[kernel_size] = offset; + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + const int M = counter[i]; + const int K = in_channels; + const int N = out_channels; + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; + T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels; + T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; + + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + blas.GEMM(CblasTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_out_grad_ptr, + static_cast(0), 
+ tmp_d_kernel_ptr); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + blas.GEMM(CblasNoTrans, + CblasTrans, + M, + K, + N, + static_cast(1), + tmp_out_grad_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_d_x_ptr); + } + + // 4. scatter + x_grad->Resize(x.non_zero_elements().dims()); + dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); + T* x_grad_values_ptr = x_grad->data(); + memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); + Scatter(d_x_features_ptr, + rulebook.data() + rulebook_len, + rulebook_len, + in_channels, + x_grad_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_conv_grad, + CPU, + ALL_LAYOUT, + phi::sparse::Conv3dGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(3).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 576015143704b..00b2a256a9504 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -59,7 +60,10 @@ void TestConv3dBase(const std::vector& indices, const std::vector& paddings, const std::vector& strides, const std::vector& dilations, - const float diff = 1e-3) { + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { phi::CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() @@ -122,10 +126,29 @@ void TestConv3dBase(const std::vector& indices, correct_out_indices.size() * sizeof(int)); ASSERT_EQ(cmp_indices, 0); - for (uint64_t i = 0; i < correct_out_features.size(); i++) { - float tmp = std::fabs(static_cast( - correct_out_features[i] - out.non_zero_elements().data()[i])); - ASSERT_LT(tmp, diff); + auto f_verify = [&](const T* real_data, + const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = + std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + + f_verify(out.non_zero_elements().data(), correct_out_features); + + if (backward) { + std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out, + paddings, + dilations, + strides, + 1); + f_verify(grads[0].data(), features_grad); + f_verify(grads[1].data(), kernel_grad); } } } @@ -141,7 +164,11 @@ void TestConv3d(const std::vector& indices, const int non_zero_num, const std::vector& paddings, const std::vector& strides, - const std::vector& dilations) { + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { // test float TestConv3dBase(indices, features, @@ -154,7 +181,11 @@ void TestConv3d(const std::vector& indices, non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + features_grad, + kernel_grad); // test double TestConv3dBase(indices, cast(features), @@ -167,7 +198,11 @@ void TestConv3d(const std::vector& indices, 
non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + cast(features_grad), + cast(kernel_grad)); } TEST(DEV_API, sparse_conv3d) { @@ -467,5 +502,66 @@ TEST(DEV_API, sparse_conv2d) { dilations); } +TEST(DEV_API, sparse_conv3d_backward) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 4, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 2, 2, 2, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 2; + std::vector indices_flatten = {0, 0, 0, 2, 3, 2, 3, 2}; + + std::vector features = {-0.28833008, 0.0287323}; + // 3*3*3=27 + std::vector kernel = { + 0.64306641, 0.45043945, 0.47216797, 0.22924805, 0.97509766, 0.86181641, + 0.57861328, 0.91796875, 0.87255859, 0.16589355, 0.44555664, 0.01889038, + 0.46459961, 0.44726562, 0.19909668, 0.89697266, 0.37158203, 0.00513077, + 0.69628906, 0.26904297, 0.74707031, 0.54003906, 0.5390625, 0.07958984, + 0.47338867, 0.90966797, 0.17126465}; + + std::vector out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + + std::vector out_features = {4.9200e-03, + 2.6140e-02, + 2.2900e-03, + -2.3596e-01, + 1.5000e-04, + 1.0670e-02, + 5.7200e-03, + 1.2850e-02}; + + std::vector features_grad = {-0.20593, -0.09149}; + std::vector kernel_grad = { + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 0.000e+00, 6.805e-02, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 3.700e-04, 1.600e-04, 0.000e+00, 3.100e-04, 0.000e+00, + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, -6.780e-03, 7.000e-05, + 0.000e+00, 7.500e-04, 1.400e-04}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad); +} + } // namespace tests } // namespace phi From 2a5590a18e3dd90f815f20a82f6dcc722bc17892 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 2 Mar 2022 16:55:19 +0800 Subject: [PATCH 18/41] Move BroadcastTensors OP to phi (#40047) * Move BroadcastTensors OP to phi * Remove mutable_data in impl * Move BilinearTensorProductInferMeta to multiary.h/cc --- .../fluid/operators/broadcast_tensors_op.cc | 99 +----- .../fluid/operators/broadcast_tensors_op.cu | 122 -------- paddle/fluid/operators/broadcast_tensors_op.h | 282 ------------------ paddle/phi/infermeta/multiary.cc | 66 +++- paddle/phi/infermeta/multiary.h | 5 + .../kernels/broadcast_tensors_grad_kernel.h | 27 ++ paddle/phi/kernels/broadcast_tensors_kernel.h | 27 ++ paddle/phi/kernels/complex_grad_kernel.h | 2 +- paddle/phi/kernels/complex_kernel.h | 14 +- .../cpu/broadcast_tensors_grad_kernel.cc | 201 +++++++++++++ .../kernels/cpu/broadcast_tensors_kernel.cc | 30 ++ .../gpu/broadcast_tensors_grad_kernel.cu | 111 +++++++ .../kernels/gpu/broadcast_tensors_kernel.cu | 30 ++ .../impl/broadcast_tensors_kernel_impl.h | 118 ++++++++ .../phi/ops/compat/broadcast_tensors_sig.cc | 28 ++ 15 files changed, 658 insertions(+), 504 deletions(-) delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.cu delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.h create mode 100644 paddle/phi/kernels/broadcast_tensors_grad_kernel.h create mode 100644 paddle/phi/kernels/broadcast_tensors_kernel.h create mode 100644 paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc create 
mode 100644 paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu create mode 100644 paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h create mode 100644 paddle/phi/ops/compat/broadcast_tensors_sig.cc diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 27b1107675d4e..c3917fad555cb 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "broadcast_tensors"); - - int target_rank = 0; - const auto& input_dims = ctx->GetInputsDim("X"); - - // 1. Find Output rank = max(Inputs rank) - for (const auto& input_ddim : input_dims) { - target_rank = std::max(target_rank, input_ddim.size()); - } - - PADDLE_ENFORCE_GT( - target_rank, 0, - platform::errors::InvalidArgument( - "BroadcastTensorsOp requires at least one input tensor" - "to have rank greater than zero")); - - std::vector target_dims(target_rank, 0); - // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) - for (int index = 0; index < target_rank; index++) { - // Loop axes in reverse order, - // For each axis, take the maximum as target size - // Fill size = 1 if shape vector exhausts - int target_dim_size = 1; - for (const auto& input_ddim : input_dims) { - // Reversed order - int axis = static_cast(input_ddim.size()) - index - 1; - int dim_size = 1; - if (axis >= 0) { - dim_size = input_ddim[axis]; - } - - if (target_dim_size != 1 && dim_size != 1 && - target_dim_size != dim_size) { - PADDLE_THROW(platform::errors::InvalidArgument( - "BroadcastTensorsOp inputs does not satisfy bcast semantics," - "Please check axis = %d in reverse order", - index)); - } - - // We performed bcast semantics check at python level - // So input tensors should all have legal shape - target_dim_size = std::max(target_dim_size, dim_size); - } - target_dims[target_rank - index - 1] = target_dim_size; - } - - // 3. 
Set Output Dim - std::vector output_ddims; - for (size_t i = 0; i < input_dims.size(); i++) { - output_ddims.emplace_back(phi::make_ddim(target_dims)); - } - ctx->SetOutputsDim("Out", output_ddims); - ctx->ShareAllLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, + BroadcastTensorsInferShapeFunctor, + PT_INFER_META(phi::BroadcastTensorsInferMeta)); + REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, ops::BroadcastTensorsGradOpMaker, ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsOpVarTypeInference); + ops::BroadcastTensorsOpVarTypeInference, + BroadcastTensorsInferShapeFunctor); REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, ops::BroadcastTensorsGradOpVarTypeInference, ops::BroadcastTensorsGradNoNeedBufVarsInferer); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors_grad, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu deleted file mode 100644 index 5882258317d7d..0000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; - -template -class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const DDim& input_dims = input_tensor->dims(); - const DDim& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // Collect reduce_dims - // Example: - // dX = [1,1,1,1] - // dOut = [1,1,1,4] - // - // reduce_dims = [3] // reduce along the broadcasted axis - std::vector reduce_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // Turns out to be a No-Op, simply copy tensors - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - // reduce_sum implementation on CUDA - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), *input_tensor, output_tensor, - kps::IdentityFunctor(), reduce_dims_vec, stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h deleted file mode 100644 index 682f2e2476922..0000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.h +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ - break; \ - } - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; -using framework::EigenTensor; - -template -class BroadcastTensorsOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto& in_tensors = context.MultiInput("X"); - auto out_tensors = context.MultiOutput("Out"); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // Eigen has no support for dynamic ranked tensor - // Thus we perform static expansion for each possible ranks - for (size_t i = 0; i < num_ins; i++) { - int out_rank = out_tensors[i]->dims().size(); - switch (out_rank) { - SWITCH_OUT_RANK_CASE(1) - SWITCH_OUT_RANK_CASE(2) - SWITCH_OUT_RANK_CASE(3) - SWITCH_OUT_RANK_CASE(4) - SWITCH_OUT_RANK_CASE(5) - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Target tensor rank out of range" - "Maximum supported rank for broadcast is: 5")); - } - } - } - } - - template - void ApplyBroadcast(const framework::ExecutionContext& context, - const Tensor* input_tensor, Tensor* output_tensor) const { - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // 1. Collect bcast_dims, each element of which indicates how many - // times we need to replicate along the corresponding dimension - // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for - // both input and output tensors, so we need to initialize input X with - // expanded dims: "new_input_dims_vec" - Eigen::DSizes bcast_dims; - std::vector new_input_dims_vec(out_rank); - for (int j = 0; j < out_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - bcast_dims[out_axis] = output_dims[out_axis]; - new_input_dims_vec[out_axis] = 1; - if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { - bcast_dims[out_axis] = 1; - new_input_dims_vec[out_axis] = input_dims[in_axis]; - } - } - auto new_input_dims = phi::make_ddim(new_input_dims_vec); - - // Initialize input X with new_input_dims_vec, so it's rank-aligned with the - // output - auto x = EigenTensor::From(*input_tensor, new_input_dims); - - output_tensor->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*output_tensor, output_dims); - - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, OutRank>::Eval(place, y, x, - bcast_dims); - } -}; - -#define SWITCH_RESHAPE_DIMS(n) \ - case n: { \ - Eigen::DSizes reshape_dims; \ - for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ - reshape_dims[i] = reshape_dims_vec[i]; \ - } \ - dX.device(place) = \ - dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ - break; \ - } - -#define UPPER_SWITCH_REDUCE_DIMS(m) \ - case m: { \ - Eigen::DSizes reduce_dims; \ - for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ - reduce_dims[i] = reduce_dims_vec[i]; \ - } \ - switch (reshape_size) { -#define LOWER_SWITCH_REDUCE_DIMS \ - default: { \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Detected reshape size: %d out of range" \ - "Minimum value should be larger than reduce size %d" \ - "While maximum supported is: 5", \ - reshape_size, reduce_size)); \ - } \ - } \ - break; \ - } - -/* ----- GradOpKernel ----- */ -template -class BroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes - // Here we perform the following Eigen operations: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - // Note the last "reshape(dX_shape)" will be performed implicitly, - // and we only need to collect reduce_dims and reshape_dims - std::vector reduce_dims_vec; - std::vector reshape_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 
1; - int in_axis = in_rank - j - 1; - - reshape_dims_vec.push_back(input_dims[j]); - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - size_t reduce_size = reduce_dims_vec.size(); - size_t reshape_size = reshape_dims_vec.size(); - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // If this turns out to be a No-Op, simply perform a tensor copy - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(broadcast_tensors)" - " must be greater than or equal to 1, but " - "the value received is %d.", - reduce_dims_vec.size())); - PADDLE_ENFORCE_LE( - reduce_dims_vec.size(), 5, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' " - "for Op(broadcast_tensors) must be less than or equal " - "to 5, but the value received is %d.", - reduce_dims_vec.size())); - - // Overall: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - auto dX = framework::EigenVector::Flatten(*output_tensor); - auto dOut = framework::EigenVector::Flatten(*input_tensor); - auto& place = - *context.template device_context().eigen_device(); - - // Expand ReduceSize and ReshapeSize into static values - switch (reduce_size) { - UPPER_SWITCH_REDUCE_DIMS(1) - SWITCH_RESHAPE_DIMS(1) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(2) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(3) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(4) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(5) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Detected reduce size: %d out of range" - "While maximum supported is: 5", - reduce_size)); - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7634e5e01aca4..dc5478e8afb98 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -13,11 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" - +#include #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors) { + std::vector dims; + dims.reserve(tensors.size()); + for (const MetaTensor* tensor : tensors) { + dims.emplace_back(tensor->dims()); + } + return dims; +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -84,6 +94,60 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out) { + int target_rank = 0; + const auto& input_dims = GetMetaTensorsDim(x); + + // 1. 
Find Output rank = max(Inputs rank) + for (const auto& input_ddim : input_dims) { + target_rank = std::max(target_rank, input_ddim.size()); + } + + PADDLE_ENFORCE_GT(target_rank, + 0, + errors::InvalidArgument("BroadcastTensorsOp requires at " + "least one input tensor to have " + "rank greater than zero")); + + std::vector target_dims(target_rank, 0); + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + for (int index = 0; index < target_rank; index++) { + // Loop axes in reverse order, + // For each axis, take the maximum as target size + // Fill size = 1 if shape vector exhausts + int target_dim_size = 1; + for (const auto& input_ddim : input_dims) { + // Reversed order + int axis = static_cast(input_ddim.size()) - index - 1; + int dim_size = 1; + if (axis >= 0) { + dim_size = input_ddim[axis]; + } + + if (target_dim_size != 1 && dim_size != 1 && + target_dim_size != dim_size) { + PADDLE_THROW(errors::InvalidArgument( + "BroadcastTensorsOp inputs does not satisfy bcast semantics, " + "please check axis = %d in reverse order", + index)); + } + + // We performed bcast semantics check at python level + // So input tensors should all have legal shape + target_dim_size = std::max(target_dim_size, dim_size); + } + target_dims[target_rank - index - 1] = target_dim_size; + } + + // 3. Set Output Dim + for (size_t i = 0; i < out.size(); i++) { + out[i]->set_dims(phi::make_ddim(target_dims)); + out[i]->share_lod(*(x[i])); + out[i]->set_dtype(x[i]->dtype()); + } +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 2afb79daa355c..51738c5e08e98 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,8 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -25,6 +27,9 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h new file mode 100644 index 0000000000000..5ec2e35cc9b0c --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
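The broadcasting rule that BroadcastTensorsInferMeta implements above (output rank is the max input rank; each trailing axis takes the max size, and mixing sizes other than 1 is rejected) can be exercised with a small stand-alone sketch. The helper below is pure Python, purely illustrative, and not part of the patch; as the comment above notes, the same semantic check is already performed at the Python level before the op runs.

def broadcast_shape(shapes):
    rank = max(len(s) for s in shapes)
    out = [1] * rank
    for i in range(rank):                        # i counts axes from the right
        for s in shapes:
            d = s[len(s) - 1 - i] if i < len(s) else 1
            if d != 1 and out[rank - 1 - i] != 1 and d != out[rank - 1 - i]:
                raise ValueError("inputs do not satisfy bcast semantics at reverse axis %d" % i)
            out[rank - 1 - i] = max(out[rank - 1 - i], d)
    return out

print(broadcast_shape([[1, 4, 1], [5, 1, 3]]))   # [5, 4, 3]
print(broadcast_shape([[2, 3], [4, 1, 3]]))      # [4, 2, 3]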
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx); + +} // namespace phi diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h new file mode 100644 index 0000000000000..fb2a6f1136c26 --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index 505d4d3744241..be13e2826ea81 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 44bfae9820aa8..3b3003392d37f 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -50,14 +50,10 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } -template -void RealKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); - -template -void ImagKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); +template +void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +template +void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc new file mode 100644 index 0000000000000..7a97f8c218973 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
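Before the CPU implementation that follows, the gradient rule is easy to state: BroadcastTensorsGrad reduces each incoming dOut with a sum over exactly those axes that were broadcast in the forward pass, i.e. axes that the corresponding dX lacks or holds with size 1. A small illustrative sketch (pure Python, names not from the patch) of how those reduce axes are chosen:

def grad_reduce_axes(dout_shape, dx_shape):
    axes = []
    for j in range(len(dout_shape)):             # walk axes from the right
        in_axis = len(dout_shape) - 1 - j
        out_axis = len(dx_shape) - 1 - j
        if out_axis < 0 or dx_shape[out_axis] != dout_shape[in_axis]:
            axes.append(in_axis)
    return axes

print(grad_reduce_axes([1, 1, 1, 4], [1, 1, 1, 1]))  # [3], the example in the kernel comments
print(grad_reduce_axes([2, 3, 4], [4]))              # [1, 0]

When the list comes back empty the kernels below simply copy dOut into dX; otherwise the CPU path reshapes and sums with Eigen, and the GPU path calls TensorReduceImpl with an identity functor.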
+ +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_RESHAPE_DIMS(n) \ + case n: { \ + Eigen::DSizes reshape_dims; \ + for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ + reshape_dims[i] = reshape_dims_vec[i]; \ + } \ + dX.device(place) = \ + dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ + break; \ + } + +#define UPPER_SWITCH_REDUCE_DIMS(m) \ + case m: { \ + Eigen::DSizes reduce_dims; \ + for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ + reduce_dims[i] = reduce_dims_vec[i]; \ + } \ + switch (reshape_size) { +#define LOWER_SWITCH_REDUCE_DIMS \ + default: { \ + PADDLE_THROW(errors::InvalidArgument( \ + "Detected reshape size: %d out of range" \ + "Minimum value should be larger than reduce size %d" \ + "While maximum supported is: 5", \ + reshape_size, \ + reduce_size)); \ + } \ + } \ + break; \ + } + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs, but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + PADDLE_ENFORCE_GE( + reduce_dims_vec.size(), + 1, + errors::InvalidArgument("The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but 
" + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), + 5, + errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = EigenVector::Flatten(*output_tensor); + auto dOut = EigenVector::Flatten(*input_tensor); + auto& place = *ctx.eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW( + errors::InvalidArgument("Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc new file mode 100644 index 0000000000000..4cb6db8769271 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu new file mode 100644 index 0000000000000..6fb24d72145c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const DDim& input_dims = input_tensor->dims(); + const DDim& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // Collect reduce_dims + // Example: + // dX = [1,1,1,1] + // dOut = [1,1,1,4] + // + // reduce_dims = [3] // reduce along the broadcasted axis + std::vector reduce_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // Turns out to be a No-Op, simply copy tensors + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + // reduce_sum implementation on CUDA + kernels::TensorReduceImpl>( + ctx, + *input_tensor, + output_tensor, + kps::IdentityFunctor(), + reduce_dims_vec, + ctx.stream()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu new file mode 100644 index 0000000000000..aa45bd3c43891 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h new file mode 100644 index 0000000000000..eb01b83377cb6 --- /dev/null +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(ctx, &in_tensors[i], out_tensors[i]); \ + break; \ + } + +namespace phi { + +template +void ApplyBroadcast(const Context& ctx, + const DenseTensor* input_tensor, + DenseTensor* output_tensor) { + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // 1. Collect bcast_dims, each element of which indicates how many + // times we need to replicate along the corresponding dimension + // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for + // both input and output tensors, so we need to initialize input X with + // expanded dims: "new_input_dims_vec" + Eigen::DSizes bcast_dims; + std::vector new_input_dims_vec(out_rank); + for (int j = 0; j < out_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + bcast_dims[out_axis] = output_dims[out_axis]; + new_input_dims_vec[out_axis] = 1; + if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { + bcast_dims[out_axis] = 1; + new_input_dims_vec[out_axis] = input_dims[in_axis]; + } + } + auto new_input_dims = phi::make_ddim(new_input_dims_vec); + + // Initialize input X with new_input_dims_vec, so it's rank-aligned with the + // output + auto x = EigenTensor::From(*input_tensor, new_input_dims); + + ctx.template Alloc(output_tensor); + auto y = EigenTensor::From(*output_tensor, output_dims); + + auto& place = *ctx.eigen_device(); + funcs::EigenBroadcast, T, OutRank>::Eval( + place, y, x, bcast_dims); +} + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out) { + const auto& in_tensors = x; + auto out_tensors = out; + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs,but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // Eigen has no support for dynamic ranked tensor + // Thus we perform static expansion for each possible ranks + for (size_t i = 0; i < num_ins; i++) { + int out_rank = out_tensors[i]->dims().size(); + switch (out_rank) { + SWITCH_OUT_RANK_CASE(1) + SWITCH_OUT_RANK_CASE(2) + SWITCH_OUT_RANK_CASE(3) + SWITCH_OUT_RANK_CASE(4) + SWITCH_OUT_RANK_CASE(5) + default: { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Target tensor rank out of range" + "Maximum supported rank for broadcast is: 5")); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc new file mode 100644 index 0000000000000..2c979c4aedcc8 --- /dev/null +++ b/paddle/phi/ops/compat/broadcast_tensors_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
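ApplyBroadcast above aligns ranks before handing the work to Eigen: the input is viewed with a rank-aligned shape (size 1 wherever it will be replicated) and a per-axis replication factor is collected in bcast_dims. A pure-Python sketch of that bookkeeping, illustrative only and not part of the patch:

def align_for_eigen(in_shape, out_shape):
    out_rank = len(out_shape)
    bcast_dims = [0] * out_rank
    new_input_dims = [0] * out_rank
    for j in range(out_rank):                    # walk axes from the right
        out_axis = out_rank - 1 - j
        in_axis = len(in_shape) - 1 - j
        bcast_dims[out_axis] = out_shape[out_axis]
        new_input_dims[out_axis] = 1
        if in_axis >= 0 and in_shape[in_axis] == out_shape[out_axis]:
            bcast_dims[out_axis] = 1
            new_input_dims[out_axis] = in_shape[in_axis]
    return new_input_dims, bcast_dims

print(align_for_eigen([4, 1], [5, 4, 3]))   # ([1, 4, 1], [5, 1, 3])

Eigen then broadcasts the reshaped input by bcast_dims, which is why the kernel only needs the static ranks 1 through 5 handled by the switch above.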
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BroadcastTensorsGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad, + phi::BroadcastTensorsGradOpArgumentMapping); From 7a857924570084851be8b6094f181f217d58fb7c Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 2 Mar 2022 17:18:53 +0800 Subject: [PATCH 19/41] Move transpose to pten (#39327) * immigrate_transpose_to_pten cpu kernel only; test=develop * fix bug; test=develop * add transpose cuda api * bug fix; * fix bugs * fix bugs; test=develop * bug fix; * move transepose to pten; test=develop * fix bug; test=develop * fix bugs; test=develop * add transpose grad fp16 support; test=develop * fix bug; test=develop * fix npu bug; test=develop * fix nemul = 0 bug; test=develop * add fp16 support; test=develop * fix data type register bug; test=develop * fix transpose bug; test=develop * update transpose * fix transpose bug; test=develop * remove useless code; test=develop * remove useless code; test=develop * fix transpose alias bug; test=develop * polish code; test=develop * resolve confict; test=develop * resolve confilct; test=develop * recover prepared operator; test=develop * fix bug; test=develop * polish code; test=develop * fix bug; test=develop * fix bug; test=develop --- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 2 +- paddle/fluid/operators/transpose_op.cc | 60 ++------ paddle/fluid/operators/transpose_op.cu | 139 ------------------ paddle/fluid/operators/transpose_op.cu.h | 42 +++--- paddle/fluid/operators/transpose_op.h | 58 -------- .../fluid/operators/transpose_op_npu_test.cc | 2 +- .../phi/kernels/cpu/transpose_grad_kernel.cc | 32 ++++ paddle/phi/kernels/cpu/transpose_kernel.cc | 80 ++++++++++ paddle/phi/kernels/funcs/math_function.cu | 51 +++++++ .../phi/kernels/gpu/transpose_grad_kernel.cu | 34 +++++ paddle/phi/kernels/gpu/transpose_kernel.cu | 57 +++++++ .../kernels/impl/transpose_grad_kernel_impl.h | 38 +++++ paddle/phi/kernels/transpose_grad_kernel.h | 28 ++++ paddle/phi/kernels/transpose_kernel.h | 28 ++++ paddle/phi/ops/compat/transpose_sig.cc | 38 +++++ .../unittests/parallel_executor_test_base.py | 2 +- ..._imperative_lod_tensor_to_selected_rows.py | 1 + .../test_parallel_executor_transformer.py | 1 + ...test_partial_eager_deletion_transformer.py | 2 + .../tests/unittests/test_transpose_op.py | 1 + 20 files changed, 426 insertions(+), 270 deletions(-) delete mode 100644 paddle/fluid/operators/transpose_op.cu create mode 100644 paddle/phi/kernels/cpu/transpose_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/transpose_kernel.cc create mode 100644 paddle/phi/kernels/gpu/transpose_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/transpose_kernel.cu create mode 100644 paddle/phi/kernels/impl/transpose_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/transpose_grad_kernel.h create mode 100644 paddle/phi/kernels/transpose_kernel.h create mode 100644 paddle/phi/ops/compat/transpose_sig.cc diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 52e2caaeb6ee1..3791fed23a84f 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -29,7 +29,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); 
USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(transpose); +USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); namespace paddle { diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 768ab21936f1e..1a297e7238ccd 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -339,6 +339,14 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { } }; +class TransposeGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SyncTypeAndDataType(framework::GradVarName("Out"), + framework::GradVarName("X")); + } +}; + } // namespace operators } // namespace paddle @@ -347,59 +355,13 @@ REGISTER_OPERATOR( transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); +REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad, + ops::TransposeGradInferVarType); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad, + ops::TransposeGradInferVarType, ops::Transpose2DoubleGradMaker, ops::Transpose2DoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - transpose2, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose2_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu deleted file mode 100644 index 02e224549a5ab..0000000000000 --- a/paddle/fluid/operators/transpose_op.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class TransposeGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *x_tensor, axis, out_tensor); - } -}; -template -class TransposeGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - if (!x_grad) { - return; - } - - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *out_grad_tensor, reversed_axis, - x_grad_tensor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - transpose2, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose2_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index b542fa37f88fd..a31ac28c9910c 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -258,10 +259,10 @@ struct SystemElemType<16> { }; template -void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, - int tile_size_i, int tile_size_j, - int total_tiles_count, const T* input, - const Dim3& input_dims, T* output) { +void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, + int tile_size_j, int total_tiles_count, + const T* input, const Dim3& input_dims, + T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { TilingSwapDim1And2< @@ -278,7 +279,7 @@ void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, template struct NarrowDims2TransposeDispatch { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -319,7 +320,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if< CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -351,7 +352,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -368,7 +369,7 @@ struct NarrowDims2TransposeDispatch< }; template -void SwapDim1And2InNarrow(const platform::CUDADeviceContext& d, const T* input, +void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, const Dim3& input_dims, T* output, const int kMinTileSize) { // First get available tile sizes for the data type requested as backups @@ -473,9 +474,8 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input, // Here suppose convert all tensor to dim3, so just change dim1 and 2. 
template -void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, - const T* input, const Dim3& input_dims, - T* output) { +void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, + const Dim3& input_dims, T* output) { // Suppose tile size > 16 static const int kMinTileSize = 16; static const int kMinNarrowTileSize = 96; @@ -512,7 +512,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, } else { // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_elements); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( total_elements, input, input_dims, output); @@ -521,7 +521,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, template struct SwapDim1And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -533,7 +533,7 @@ struct SwapDim1And2InTranspose { template struct SwapDim0And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -541,7 +541,7 @@ struct SwapDim0And2InTranspose { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( @@ -607,7 +607,7 @@ inline void CombineTransposeDim3(const framework::DDim& shape, template struct TransposeSimple { - static bool run(const platform::CUDADeviceContext& ctx, const Tensor& in, + static bool run(const phi::GPUContext& ctx, const Tensor& in, const std::vector perm, Tensor* out) { // First reduce the dimensions of the input tensor if possible. 
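// (Worked example with a made-up shape:) source dimensions that stay
// adjacent and in order under `perm` can be merged first, e.g. shape
// [4, 5, 6, 7] with perm = {0, 1, 3, 2} reduces to shape [20, 6, 7] with
// perm = {0, 2, 1}, which the tiled dim1/dim2 swap above then handles; if
// the reduced permutation is not one of the supported patterns, run()
// reports failure and the driver below falls back to the generic
// TransCompute path.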
std::vector new_perm; @@ -654,12 +654,12 @@ struct TransposeSimple { }; template -void TransposeGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const int ndims, const Tensor& in, - const std::vector perm, Tensor* out) { +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, + const Tensor& in, + const std::vector& perm, Tensor* out) { auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + TransCompute(ndims, dev_ctx, in, out, perm); } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index ec05a534c0ef5..a9e4876cc82a4 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -59,63 +59,5 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } -template -class TransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, axis); - } -}; - -template -class TransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - - if (!x_grad) { - return; - } - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *out_grad_tensor, - x_grad_tensor, reversed_axis); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index cce3f188c8b74..5617d728a51dc 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -31,7 +31,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(transpose2); +USE_OP_ITSELF(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); template diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc new file mode 100644 index 0000000000000..9dbcf575f33c1 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(transpose_grad, + CPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc new file mode 100644 index 0000000000000..a80196e7f80e1 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_kernel.h" +#include +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { + +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + int rank = axis.size(); + switch (rank) { + case 1: + funcs::Transpose trans1; + trans1(ctx, x, out, axis); + break; + case 2: + funcs::Transpose trans2; + trans2(ctx, x, out, axis); + break; + case 3: + funcs::Transpose trans3; + trans3(ctx, x, out, axis); + break; + case 4: + funcs::Transpose trans4; + trans4(ctx, x, out, axis); + break; + case 5: + funcs::Transpose trans5; + trans5(ctx, x, out, axis); + break; + case 6: + funcs::Transpose trans6; + trans6(ctx, x, out, axis); + break; + default: + // for rank >= 7 situation + funcs::TransposeNormal trans_normal; + trans_normal(ctx, x, out, axis); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + CPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index ae368a005f057..df2af82d551ee 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -187,6 +187,57 @@ void TransposeNormal::operator()( in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank); } +template 
+struct TransposeNormal { + void operator()(const phi::GPUContext& context, + const DenseTensor& in, + DenseTensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = stride(in.dims()); + auto out_stride = stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const phi::GPUPlace& cuda_place = context.GetPlace(); + phi::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, + out_ptr, + elements, + in_stride_ptr, + out_stride_ptr, + axis_ptr, + rank); + } +}; + // define transpose normal #define DEFINE_GPU_TRANS_NORMAL(TYPE) \ template struct TransposeNormal; \ diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu new file mode 100644 index 0000000000000..0687dc0c200a8 --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(transpose_grad, + GPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu new file mode 100644 index 0000000000000..9ea2af292ccf1 --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#include "paddle/fluid/framework/gpu_utils.h" +#include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + int rank = axis.size(); + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + paddle::operators::TransposeGPUKernelDriver(ctx, rank, x, axis, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + GPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h new file mode 100644 index 0000000000000..6bb555fe28f11 --- /dev/null +++ b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector reversed_axis(axis); + + dev_ctx.template Alloc(x_grad); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + TransposeKernel(dev_ctx, out_grad, reversed_axis, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_grad_kernel.h b/paddle/phi/kernels/transpose_grad_kernel.h new file mode 100644 index 0000000000000..33d4ca7e3c6c2 --- /dev/null +++ b/paddle/phi/kernels/transpose_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h new file mode 100644 index 0000000000000..303b4a9a8f05d --- /dev/null +++ b/paddle/phi/kernels/transpose_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/transpose_sig.cc b/paddle/phi/ops/compat/transpose_sig.cc new file mode 100644 index 0000000000000..90961760cfc66 --- /dev/null +++ b/paddle/phi/ops/compat/transpose_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
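// For reference while reading the new grad kernel in this patch:
// TransposeGradKernel builds the inverse permutation via
// reversed_axis[axis[i]] = i and then reuses the forward TransposeKernel on
// out_grad. A standalone check of that inverse (plain C++, axis values made
// up for this note):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> axis = {2, 0, 1};
  std::vector<int> reversed(axis.size());
  for (int i = 0; i < static_cast<int>(axis.size()); ++i) {
    reversed[axis[i]] = i;
  }

  // Composing a permutation with its inverse restores the original order,
  // which is why transposing out_grad by `reversed` undoes the forward op.
  std::vector<int> dims = {0, 1, 2}, once(3), round_trip(3);
  for (int i = 0; i < 3; ++i) once[i] = dims[axis[i]];
  for (int i = 0; i < 3; ++i) round_trip[i] = once[reversed[i]];

  std::printf("reversed = {%d, %d, %d}, round trip = {%d, %d, %d}\n",
              reversed[0], reversed[1], reversed[2],
              round_trip[0], round_trip[1], round_trip[2]);  // {1,2,0}, {0,1,2}
  return 0;
}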
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("transpose", {"X"}, {"axis"}, {"Out"}); +} + +KernelSignature TransposeGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "transpose_grad", {GradVarName("Out")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(transpose2, transpose); +PD_REGISTER_BASE_KERNEL_NAME(transpose2_grad, transpose_grad); + +PD_REGISTER_ARG_MAPPING_FN(transpose2, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose2_grad, + phi::TransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose_grad, phi::TransposeGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 2a8f72c217055..2633a5992563f 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,7 +43,7 @@ def check_network_convergence(cls, get_data_from_feeder=None, use_parallel_executor=True, use_reduce=False, - use_ir_memory_optimize=True, + use_ir_memory_optimize=False, enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_all_optimizer_ops=False, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index d54194164a559..110bb961bbe12 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -207,4 +207,5 @@ def simple_net_float32(self, is_sparse, dtype): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 1cb39eb131b82..b87e8d4e3c21a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -206,4 +206,5 @@ def test_main(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index 1661f753a8464..15d9e0e2daa5e 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -14,10 +14,12 @@ import unittest import paddle.fluid as fluid +import paddle fluid.core._set_eager_deletion_mode(0.0, 0.55, True) from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 13b880b28bf85..1e6b4354dd9c8 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -463,4 +463,5 @@ def test_error(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() From 
66196573ffe73bd3e02a4f713e2b2578bbf601aa Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 2 Mar 2022 17:50:32 +0800 Subject: [PATCH 20/41] [XPU] Fix Phi Kernel cache problem in operator.cc (#40044) * [XPU] Fix Phi Kernel cache problem in operator.cc * fix typo --- paddle/fluid/framework/operator.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b91ee3c2d633d..ffdc3e6d3c2bc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1210,6 +1210,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } + } else { + pt_kernel_name = pt_kernel_signature_->name; + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); } #ifdef PADDLE_WITH_XPU bool is_xpu_unsupport = From 5898e9abecc05bc039e29838ec4b8fb49ae2d3f0 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 2 Mar 2022 18:25:54 +0800 Subject: [PATCH 21/41] [Phi]Move elementwise function to funcs directory (#39986) * move elementwise function to funcs directory * fix compile bugs * modify according to comment --- .../elementwise/elementwise_add_op.kps | 2 +- .../elementwise/elementwise_op_broadcast.cu.h | 3 - .../elementwise/elementwise_op_function.h | 29 +- .../elementwise/elementwise_op_impl.cu.h | 2 +- paddle/fluid/operators/viterbi_decode_op.h | 12 +- paddle/phi/kernels/cpu/elementwise.h | 619 +---------------- paddle/phi/kernels/cpu/elementwise_grad.h | 146 ++++ .../kernels/cpu/elementwise_grad_kernel.cc | 27 +- paddle/phi/kernels/cpu/logical_kernel.cc | 20 +- paddle/phi/kernels/cpu/math_kernel.cc | 9 +- paddle/phi/kernels/funcs/broadcast_function.h | 18 +- paddle/phi/kernels/funcs/elementwise_base.h | 285 ++++---- .../elementwise_grad_base.h} | 655 +++++++++++------- paddle/phi/kernels/funcs/elementwise_utils.h | 121 ++++ paddle/phi/kernels/gpu/elementwise_grad.h | 246 +++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 27 +- paddle/phi/kernels/gpu/logical_kernel.cu | 3 +- paddle/phi/kernels/gpu/math_kernel.cu | 2 +- .../impl/elementwise_grad_kernel_impl.h | 33 +- 19 files changed, 1149 insertions(+), 1110 deletions(-) create mode 100644 paddle/phi/kernels/cpu/elementwise_grad.h rename paddle/phi/kernels/{gpu/elementwise.h => funcs/elementwise_grad_base.h} (78%) create mode 100644 paddle/phi/kernels/funcs/elementwise_utils.h create mode 100644 paddle/phi/kernels/gpu/elementwise_grad.h diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index d6e0749318e90..3b7457d72e15d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -39,7 +39,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #else #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 418779c32e8bc..102127e6ffe4e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -16,9 +16,6 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/phi/kernels/gpu/elementwise.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index a1a7f83109866..61862aa9f8740 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -133,7 +134,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return phi::funcs::trim_trailing_singular_dims(dims); + return phi::funcs::TrimTrailingSingularDims(dims); } template ( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { - phi::ElemwiseGradComputeWithBroadcast( + phi::funcs::ElemwiseGradComputeWithBroadcast( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } @@ -173,19 +174,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { z->mutable_data(ctx.GetPlace()); - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, - z); - -#endif - return; - } - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); + const auto &dev_ctx = ctx.template device_context(); + phi::funcs::ElementwiseCompute(dev_ctx, *x, *y, axis, + func, z); } // FusedElemwiseAndAct @@ -443,8 +434,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -991,8 +982,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 7d7bb4f26fcf4..f49e2ab4e173e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index 8f01a0c36043b..bf12a03e7b4dc 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -151,12 +151,12 @@ struct GetInputIndex { const std::vector& output_strides, int output_idx, int* index_array, int* lhs_idx, int* rhs_idx) { int out_dims_size = output_strides.size(); - *lhs_idx = - phi::GetElementwiseIndex(lhs_dims.data(), out_dims_size, index_array); - *rhs_idx = - phi::GetElementwiseIndex(rhs_dims.data(), out_dims_size, index_array); - phi::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, - index_array); + *lhs_idx = phi::funcs::GetElementwiseIndex(lhs_dims.data(), out_dims_size, + index_array); + *rhs_idx = phi::funcs::GetElementwiseIndex(rhs_dims.data(), out_dims_size, + index_array); + phi::funcs::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, + index_array); } }; diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index 28bf5ab743f6d..0f67df661136d 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -189,250 +189,6 @@ struct SameDimsMultiplyFunctor< } }; -inline void UpdateElementwiseIndexArray(const int* out_dims_array, - const int max_dim, - int* index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int* x_dims_array, - const int max_dim, - const int* index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonGradBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - DX_OP dx_op, - DY_OP dy_op) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - const Tout* out_data = out.data(); - const Tout* dout_data = dout.data(); - T* dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); - T* dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); - if (dx_data != nullptr) { - memset(dx_data, 0, dx->numel() * sizeof(T)); - } - if (dy_data != nullptr) { - memset(dy_data, 0, dy->numel() * sizeof(T)); - } - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (dx_data != nullptr) { - dx_data[x_index] += dx_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - if (dy_data != nullptr) { - dy_data[y_index] += dy_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonForwardBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL( - x_data, phi::errors::InvalidArgument("The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL( - y_data, phi::errors::InvalidArgument("The input Y should not be empty.")); - OutType* out_data = ctx.Alloc(z); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = 
func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - const DDim& x_dims, - const DDim& y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - Functor func, - DenseTensor* z) { - dev_ctx.Alloc(z); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - funcs::TransformFunctor functor( - x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. 
- // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - template struct SameDimsElementwiseCompute { void operator()(const CPUContext& dev_ctx, @@ -443,377 +199,4 @@ struct SameDimsElementwiseCompute { } }; -// BACKWARD CODE - -template -static void ElemwiseGradBroadcast1CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int h, - int w, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int x_offset = i * w + j; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int y_offset = i * w + j; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } -} - -template -static void ElemwiseGradBroadcast2CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int pre, - int n, - int post, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int x_offset = i * n * post + j * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0 && k == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int y_offset = i * n * post + j * post + k; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0 && k == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } - } -} - -template -void CommonElementwiseBroadcastBackward(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - int max_dim = std::max(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - // for inplace strategy. memset will make dx and dout clear and get wrong - // result. 
- if (dx && dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); - } - - VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << phi::make_ddim(x_dims_array) - << " ydim:" << phi::make_ddim(y_dims_array); - - CommonGradBroadcastCPU(x, - y, - out, - dout, - dx, - dy, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - ctx, - dx_op, - dy_op); -} - -template -void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - bool is_xsize_larger = true; - - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common backward implementation. - if (is_run_common_broadcast) { - CommonElementwiseBroadcastBackward( - ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - return; - } - if (post == 1) { - ElemwiseGradBroadcast1CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } else { - ElemwiseGradBroadcast2CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } -} - -// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. -// explicit gradient can cut off X, Y, Out from gradient op -// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse -// elementwise code. 
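// Illustrative aside on the broadcast bookkeeping used by these grad
// helpers (shapes made up for this note): with x of shape [2, 3, 4], y of
// shape [3] and axis = 1, the shapes decompose into pre = 2, n = 3 and
// post = 4. For elementwise_add, dx is simply dout, while dy reduces dout
// over the pre and post axes, as in ElemwiseGradBroadcast2CPU above. A
// minimal standalone sketch of that reduction:

#include <cstdio>
#include <vector>

int main() {
  const int pre = 2, n = 3, post = 4;
  std::vector<float> dout(pre * n * post, 1.0f);  // stand-in upstream grad

  std::vector<float> dy(n, 0.0f);
  for (int i = 0; i < pre; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < post; ++k)
        dy[j] += dout[(i * n + j) * post + k];

  // Each dy[j] accumulates pre * post = 8 contributions.
  std::printf("dy = [%g, %g, %g]\n", dy[0], dy[1], dy[2]);
  return 0;
}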
-template -void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - const DDim& x_dim = x.dims(); - const DDim& y_dim = y.dims(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } else { - ElemwiseGradComputeWithBroadcast(dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } -} - -/* -****************************** - Add Grad -****************************** -*/ -template -struct IdentityGrad { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - auto blas = phi::funcs::GetBlas(ctx); - if (dx) { - blas.VCOPY( - dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); - } - - if (dy) { - blas.VCOPY( - dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); - } -} - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, IdentityGrad>( - ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -struct SubGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -struct SubGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } -}; - -template -void elementwise_sub_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); -} - } // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad.h b/paddle/phi/kernels/cpu/elementwise_grad.h new file mode 100644 index 0000000000000..92587566eb875 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_grad.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" + +namespace phi { + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. +// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. +template +void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim& x_dim = x.dims(); + const DDim& y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } else { + funcs::ElemwiseGradComputeWithBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } +} + +/* +****************************** + Add Grad +****************************** +*/ +template +struct IdentityGrad { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + auto blas = phi::funcs::GetBlas(ctx); + if (dx) { + blas.VCOPY( + dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); + } + + if (dy) { + blas.VCOPY( + dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); + } +} + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, IdentityGrad>( + ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +struct SubGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +struct SubGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } +}; + +template +void ElementwiseSubGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, SubGradDY>( + ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index c878e8133ffc0..e48ee80595908 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -17,7 +17,8 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include 
"paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -33,7 +34,7 @@ void AddGradFunc(const CPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { ElemwiseExplicitGradCompute, IdentityGrad>( dev_ctx, @@ -68,15 +69,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -101,7 +94,7 @@ void SubtractGradKernel(const Context& dev_ctx, DenseTensor* dy) { // skip out auto* out = &dout; - elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); + ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); } template @@ -112,15 +105,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc index 3d179e1e75f4f..a0747b128e538 100644 --- a/paddle/phi/kernels/cpu/logical_kernel.cc +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -16,7 +16,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/logical_functor.h" // See Note [ Why still include the fluid headers? 
] @@ -24,15 +24,15 @@ namespace phi { -#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ - template \ - void Logical##type##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - funcs::Logical##type##Functor binary_func; \ - ElementwiseCompute, T, bool>( \ - dev_ctx, x, y, -1, binary_func, out); \ +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + funcs::ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ } DEFINE_LOGICAL_BINARY_KERNEL(And) diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 5cfcfe62c7816..250f656926c05 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -20,6 +20,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" @@ -45,10 +46,10 @@ namespace phi { auto x_dims = x.dims(); \ auto y_dims = y.dims(); \ if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::name##Functor(), out); \ } else { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ } \ } \ @@ -93,10 +94,10 @@ void DivideRawKernel(const Context& dev_ctx, auto x_dims = x.dims(); auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::DivideFunctor(), out); } else { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); } } diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 84a36b849afa1..e9fd4cf47b834 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -25,6 +25,8 @@ namespace kps = phi::kps; namespace phi { namespace funcs { +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) + struct DimensionsTransform { using DimVector = std::vector; typedef void (*MergeFunctor)( @@ -183,8 +185,6 @@ struct DimensionsTransform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) - template __device__ __forceinline__ void LoadData( T *dst, @@ -578,6 +578,20 @@ void BroadcastKernel(const KPDevice &ctx, } } +template +void ElementwiseCompute(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + std::vector ins = {&x, &y}; + std::vector outs = {z}; + z->mutable_data(dev_ctx.GetPlace()); + BroadcastKernel( + dev_ctx, ins, &outs, axis, func); +} + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index d369781f845eb..235dbdd40f6b7 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -18,7 +18,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -44,28 +45,6 @@ using ConditionalT = namespace funcs { using DDim = phi::DDim; -template -struct ElemwiseGradNoBroadcast { - const T *x_; - const T *y_; - const Tout *out_; - const Tout *dout_; - - HOSTDEVICE void operator()(size_t i) { - if (dx_ != nullptr) { - dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); - } - if (dy_ != nullptr) { - dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); - } - } - - DX_OP dx_op_; - DY_OP dy_op_; - T *dx_; - T *dy_; -}; - template class RowwiseTransformIterator; @@ -293,73 +272,172 @@ class TransformFunctor { bool is_xsize_larger_; }; -inline DDim trim_trailing_singular_dims(const DDim &dims) { - // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims.size(); - for (; actual_dims_size != 0; --actual_dims_size) { - if (dims[actual_dims_size - 1] != 1) break; - } - if (actual_dims_size == dims.size()) return dims; - std::vector trim_dims; - trim_dims.resize(actual_dims_size); - for (int i = 0; i < actual_dims_size; ++i) { - trim_dims[i] = dims[i]; - } - if (trim_dims.size() == 0) { - return DDim(phi::make_dim()); +template +void CommonForwardBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL( + x_data, errors::InvalidArgument("The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + y_data, errors::InvalidArgument("The input Y should not be empty.")); + OutType *out_data = ctx.Alloc(z); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); } - DDim actual_dims = phi::make_ddim(trim_dims); - return actual_dims; } -/* - * Out = X ⊙ Y - * If Y's shape does not match X' shape, they will be reshaped. - * For example: - * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - * pre=2, n=3*4, post=5 - * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) - * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) - * pre=2*3, n=4*5, post=1 - * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) - * - * New parameter: *is_run_common_broadcast* is a flag to record whether to run - * common broadcast code. 
- */ -inline void get_mid_dims(const DDim &x_dims, - const DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post, - int *is_run_common_broadcast) { - *pre = 1; - *n = 1; - *post = 1; - *is_run_common_broadcast = 0; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - if (x_dims[i + axis] != y_dims[i]) { - PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, - true, - phi::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands " - "could not be broadcast together with the shape of " - "X = [%s] and the shape of Y = [%s]. Received [%d] " - "in X is not equal to [%d] in Y.", - x_dims, - y_dims, - x_dims[i + axis], - y_dims[i])); - *is_run_common_broadcast = 1; - return; - } - (*n) *= y_dims[i]; - } - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; +template +void CommonElementwiseBroadcastForward(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + const DDim &x_dims, + const DDim &y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + phi::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + phi::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + dev_ctx.Alloc(z); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + TransformFunctor functor( + x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; } } @@ -395,41 +473,11 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); *ddx_safe = phi::Empty(dev_ctx, std::move(meta)); ddx_safe->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + SetConstant set_zero; set_zero(dev_ctx, ddx_safe, static_cast(0)); } } -template -void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, - const DDim &x_dim, - const DDim &y_dim, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - int axis, - DenseTensor *dx, - DenseTensor *dy, - DX_OP dx_op, - DY_OP dy_op) { - size_t N = static_cast(phi::product(x_dim)); - phi::funcs::ForRange for_range(dev_ctx, N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), - y.data(), - out.data(), - dout.data(), - dx_op, - dy_op, - dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), - dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); -} - inline void ElementwiseGradPreProcess(const DenseTensor &dout, DenseTensor *dx) { if (dx != nullptr) { @@ -806,6 +854,7 @@ void ElementwiseKernel(const KPDevice &ctx, } } } + #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h similarity index 78% rename from paddle/phi/kernels/gpu/elementwise.h rename to paddle/phi/kernels/funcs/elementwise_grad_base.h index 12cafc7023bb5..dff0cfe5b8b90 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -14,16 +14,25 @@ limitations under the License. */ #pragma once -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +#endif #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; #else constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif + #define BLOCK_X 32 #define BLOCK_Y 32 @@ -36,21 +45,361 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; namespace phi { -// General binary elementwise comutaion with the support of broadcast. -template -void ElementwiseCompute(const GPUContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - std::vector ins = {&x, &y}; - std::vector outs = {z}; - z->mutable_data(dev_ctx.GetPlace()); - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, axis, func); +namespace funcs { +using DDim = phi::DDim; + +template +void CommonGradBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + DX_OP dx_op, + DY_OP dy_op) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + const Tout *out_data = out.data(); + const Tout *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T *dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); + if (dx_data != nullptr) { + memset(dx_data, 0, dx->numel() * sizeof(T)); + } + if (dy_data != nullptr) { + memset(dy_data, 0, dy->numel() * sizeof(T)); + } + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (dx_data != nullptr) { + dx_data[x_index] += dx_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + if (dy_data != nullptr) { + dy_data[y_index] += dy_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +static void ElemwiseGradBroadcast1CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int x_offset = i * w + j; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. 
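// [Editorial note, not part of the patch] In this branch x is the operand
// being broadcast, so every row i of dout contributes to the same dx[j].
// The gradient of a broadcast input is therefore a reduction over the
// broadcast extent: dx[j] is initialised at i == 0 and accumulated with
// "+=" for the remaining rows, mirroring how dy[j] is accumulated in the
// is_xsize_larger branch above.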
+ for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int y_offset = i * w + j; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } +} + +template +static void ElemwiseGradBroadcast2CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int x_offset = i * n * post + j * post + k; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0 && k == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int y_offset = i * n * post + j * post + k; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0 && k == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } + } +} + +template +void CommonElementwiseBroadcastBackward(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << phi::make_ddim(x_dims_array) + << " ydim:" << phi::make_ddim(y_dims_array); + + CommonGradBroadcastCPU(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); + } else { + ElemwiseGradBroadcast2CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); + } +} + +template +struct ElemwiseGradNoBroadcast { + const T *x_; + const T *y_; + const Tout *out_; + const Tout *dout_; + + HOSTDEVICE void operator()(size_t i) { + if (dx_ != nullptr) { + dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); + } + if (dy_ != nullptr) { + dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); + } + } + + DX_OP dx_op_; + DY_OP dy_op_; + T *dx_; + T *dy_; +}; + +template +void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, + const DDim &x_dim, + const DDim &y_dim, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + size_t N = static_cast(phi::product(x_dim)); + phi::funcs::ForRange for_range(dev_ctx, N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), + y.data(), + out.data(), + dout.data(), + dx_op, + dy_op, + dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), + dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); } +#if defined(__NVCC__) || defined(__HIPCC__) // Suppose only has contiguous dims static inline bool CheckContiguousDims(const std::vector &broadcast_pos) { for (int i = 1; i < broadcast_pos.size(); ++i) { @@ -114,7 +463,6 @@ inline void ComputeBroadcastKernelSize(int *x_dims_array, } } -#ifndef __xpu__ template static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x, const T *y, @@ -1282,13 +1630,13 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); // for inplace strategy. memset will make dx and dout clear and get wrong // result. 
if (dx && dx->IsSharedBufferWith(dout)) { @@ -1340,37 +1688,37 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, PADDLE_ENFORCE_GE( axis, 0, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); int pre, n, post, is_run_common_broadcast, axis_trim = 0; if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } // special case for common backward implementation. if (is_run_common_broadcast) { @@ -1408,228 +1756,7 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, } } -/* -****************************** - Add Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseAddGradCUDAKernel( - const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { - int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - int stride = GRID_NUM_X * BLOCK_NUM_X; - int loop = size / vec_size; - int remainder = size % vec_size; - const float4 *dout_vec = reinterpret_cast(dout); - float4 *dx_vec = reinterpret_cast(dx); - float4 *dy_vec = reinterpret_cast(dy); - float4 tmp_loop; - - for (int i = tid; i < loop; i += stride) { - tmp_loop = dout_vec[i]; - dx_vec[i] = tmp_loop; - dy_vec[i] = tmp_loop; - } - - if (tid == loop && remainder != 0) { - T tmp_rem; - while (remainder) { - int idx = size - remainder; - remainder--; - tmp_rem = dout[idx]; - dx[idx] = tmp_rem; - dy[idx] = tmp_rem; - } - } -} - -template -void default_elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - auto *dout_data = dout.data(); - if (dx_data == dout_data && dy_data != dout_data) { - VLOG(4) << "Special case when dx_data is the same as dout_data, " - "only need copy dout to dy"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } else if (dx_data != dout_data && dy_data == dout_data) { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "only need copy dout to dx"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } else if (dx_data != dout_data && dy_data != dout_data) { - auto size = x.numel(); - int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / - PREDEFINED_BLOCK_SIZE, - 1); - SimpleElemwiseAddGradCUDAKernel< - T><<>>( - dout.data(), - size, - vec_size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); - } else { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "and dx_data is the same as dout_data, do not need " - "any operator"; - } -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, - int64_t size, - T *dx, - T *dy) { - int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - - while (col < size) { - if (dx != nullptr) { - dx[col] = dout[col]; - } - dy[col] = -dout[col]; - col += BLOCK_NUM_X * GRID_NUM_X; - } -} - -template -void default_elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = dy->numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = x.numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), - size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); -} - #endif +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_utils.h b/paddle/phi/kernels/funcs/elementwise_utils.h new file mode 100644 index 0000000000000..3790044346dc4 --- /dev/null +++ b/paddle/phi/kernels/funcs/elementwise_utils.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { + +namespace funcs { + +using DDim = phi::DDim; + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * + * New parameter: *is_run_common_broadcast* is a flag to record whether to run + * common broadcast code. 
+ */ +inline void GetMidDims(const DDim &x_dims, + const DDim &y_dims, + const int axis, + int *pre, + int *n, + int *post, + int *is_run_common_broadcast) { + *pre = 1; + *n = 1; + *post = 1; + *is_run_common_broadcast = 0; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + if (x_dims[i + axis] != y_dims[i]) { + PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, + true, + phi::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands " + "could not be broadcast together with the shape of " + "X = [%s] and the shape of Y = [%s]. Received [%d] " + "in X is not equal to [%d] in Y.", + x_dims, + y_dims, + x_dims[i + axis], + y_dims[i])); + *is_run_common_broadcast = 1; + return; + } + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } +} + +inline DDim TrimTrailingSingularDims(const DDim &dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + if (actual_dims_size == dims.size()) return dims; + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(phi::make_dim()); + } + DDim actual_dims = phi::make_ddim(trim_dims); + return actual_dims; +} + +inline int GetElementwiseIndex(const int *x_dims_array, + const int max_dim, + const int *index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +inline void UpdateElementwiseIndexArray(const int *out_dims_array, + const int max_dim, + int *index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h new file mode 100644 index 0000000000000..b17196b6b1156 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
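[Editorial note, not part of the patch] The (pre, n, post) decomposition documented above is easiest to verify on the shapes from case 1 of the comment. The sketch below is illustrative only; it assumes the phi::funcs namespace this new header introduces and the phi::make_ddim helper already used elsewhere in this series.

#include "paddle/phi/kernels/funcs/elementwise_utils.h"

void GetMidDimsExample() {
  int pre = 0, n = 0, post = 0, is_run_common_broadcast = 0;
  // shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1
  phi::funcs::GetMidDims(phi::make_ddim({2, 3, 4, 5}),
                         phi::make_ddim({3, 4}),
                         /*axis=*/1,
                         &pre, &n, &post, &is_run_common_broadcast);
  // pre == 2, n == 3 * 4 == 12, post == 5, is_run_common_broadcast == 0,
  // i.e. X is traversed as (2, 12, 5) and Y as (1, 12, 1), so the caller
  // can take the RunMidWise fast path instead of the generic broadcast one.
}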
*/ + +#pragma once + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +/* +****************************** + Add Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { + int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + int stride = GRID_NUM_X * BLOCK_NUM_X; + int loop = size / vec_size; + int remainder = size % vec_size; + const float4 *dout_vec = reinterpret_cast(dout); + float4 *dx_vec = reinterpret_cast(dx); + float4 *dy_vec = reinterpret_cast(dy); + float4 tmp_loop; + + for (int i = tid; i < loop; i += stride) { + tmp_loop = dout_vec[i]; + dx_vec[i] = tmp_loop; + dy_vec[i] = tmp_loop; + } + + if (tid == loop && remainder != 0) { + T tmp_rem; + while (remainder) { + int idx = size - remainder; + remainder--; + tmp_rem = dout[idx]; + dx[idx] = tmp_rem; + dy[idx] = tmp_rem; + } + } +} + +template +void DefaultElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + } + } +} + +template +void ElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + ctx.template Alloc(dx); + ctx.template Alloc(dy); + auto *dx_data = dx->data(); + auto *dy_data = dy->data(); + auto *dout_data = dout.data(); + if (dx_data == dout_data && dy_data != dout_data) { + VLOG(4) << "Special case when dx_data is the same as dout_data, " + "only need copy dout to dy"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } else if (dx_data != dout_data && dy_data == dout_data) { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "only need copy dout to dx"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } else if (dx_data != dout_data && dy_data != dout_data) { + auto size = x.numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / + PREDEFINED_BLOCK_SIZE, + 1); + SimpleElemwiseAddGradCUDAKernel< + T><<>>( + dout.data(), + size, + vec_size, + 
dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); + } else { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "and dx_data is the same as dout_data, do not need " + "any operator"; + } +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, + int64_t size, + T *dx, + T *dy) { + int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + + while (col < size) { + if (dx != nullptr) { + dx[col] = dout[col]; + } + dy[col] = -dout[col]; + col += BLOCK_NUM_X * GRID_NUM_X; + } +} + +template +void default_elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = dy->numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); + } + } +} + +template +void elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = x.numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), + size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); +} +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 3c4c01b1dc8ff..d00888aee6701 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -17,8 +17,9 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" namespace phi { @@ -33,9 +34,9 @@ void AddGradFunc(const GPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != 
nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { - default_elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy, axis); + DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); } } @@ -58,15 +59,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -106,15 +99,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu index f32d4c77d4059..1c0bafc932ee8 100644 --- a/paddle/phi/kernels/gpu/logical_kernel.cu +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -16,9 +16,8 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/logical_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index fc73ccca6de18..af9d5574aa9fe 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" #include "paddle/phi/kernels/gpu/reduce.h" #ifdef __NVCC__ diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 460e74b58166a..ac7d6fd1a0e9c 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace phi { @@ -47,19 +47,14 @@ void AddGradImpl(const Context& dev_ctx, } } -template +template void AddDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func, - GradInverseFunc grad_inverse_func) { + DenseTensor* ddout) { // ddOut = ddx + ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -72,28 +67,28 @@ void AddDoubleGradImpl(const Context& dev_ctx, auto ddx_dims = ddx_safe.dims(); auto ddy_dims = ddy_safe.dims(); if (ddx_dims.size() >= ddy_dims.size()) { - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::AddFunctor(), ddout); } else { - grad_inverse_func(dev_ctx, - ddx_safe, - ddy_safe, - axis, - funcs::InverseAddFunctor(), - ddout); + funcs::ElementwiseCompute, T>( + dev_ctx, + ddx_safe, + ddy_safe, + axis, + funcs::InverseAddFunctor(), + ddout); } } } -template +template void SubtractDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func) { + DenseTensor* ddout) { // DDOut = ddx - ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -103,7 +98,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, dev_ctx, y, ddy.get_ptr(), &ddy_safe); ddout->mutable_data(dev_ctx.GetPlace()); - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor(), ddout); } } From 2e6548a9cd2224e1a4b89c1351f1078273f98328 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:40:00 +0800 Subject: [PATCH 22/41] vec scale kernel (#40011) --- .../optimizers/distributed_fused_lamb_op.cu | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index ca0828a6f6ab7..8bb4606ffff15 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -304,14 +304,30 @@ struct AndFunctor { HOSTDEVICE bool operator()(bool x, bool y) const { return x && y; } }; -template +template static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, const T2 *__restrict__ scale, T1 *__restrict__ y, int num) { static_assert(sizeof(T1) <= sizeof(T2), "sizeof(T1) must be not greater than sizeof(T2)."); T2 s = scale[0]; - CUDA_KERNEL_LOOP(i, num) { + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector x_vec; + platform::AlignedVector y_vec; + + platform::Load(x + i, &x_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + y_vec[j] = static_cast(static_cast(x_vec[j]) * s); + } + platform::Store(y_vec, y + i); + } + + for (; i < num; ++i) { y[i] = static_cast(static_cast(x[i]) * s); } } @@ -396,7 +412,6 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( for (; i + VecSize <= num; i += stride) { platform::AlignedVector param_vec; platform::AlignedVector grad_vec; - platform::AlignedVector weight_decay_vec; 
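// [Editorial note, not part of the patch] The vectorised ScaleCUDAKernel above
// replaces the old CUDA_KERNEL_LOOP with a grid-stride loop over VecSize-wide
// chunks: each thread does one platform::Load into an AlignedVector, scales
// the VecSize elements, and writes them back with one platform::Store; the
// trailing scalar loop covers the last num % VecSize elements. For example,
// with num = 10, VecSize = 4 and a single block of two threads:
//   thread 0 starts at i = 0 and handles x[0..3] in one vector step,
//   thread 1 starts at i = 4 and handles x[4..7],
//   thread 0 then advances to i = 8, fails i + VecSize <= num, and finishes
//   x[8] and x[9] through the scalar tail.
// The launcher introduced below (LaunchScaleKernel) picks vec_size via
// GetChunkedVecSize and GetGpuLaunchConfig1D and dispatches through
// PD_VEC_LAUNCH_KERNEL, presumably so insufficiently aligned buffers fall
// back to the VecSize = 1 case.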
platform::AlignedVector mom1_vec; platform::AlignedVector mom2_vec; platform::AlignedVector trust_ratio_div_vec; @@ -760,6 +775,24 @@ static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, return false; } +template +static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, + const T1 *x, const T2 *scale, T1 *y, int n, + gpuStream_t stream) { + int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel<<>>( \ + x, scale, y, n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); +#undef PD_LAMB_VEC_SCALE_KERNEL_CASE +} + template static void NCCLReduceScatterWithScale( const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, @@ -775,10 +808,8 @@ static void NCCLReduceScatterWithScale( PADDLE_ENFORCE_EQ(nranks, 1, platform::errors::InvalidArgument( "nranks must be 1 when scale != nullptr.")); - auto numel = recvcount * nranks; - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, recvbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, recvcount * nranks, + stream); } return; } @@ -792,9 +823,7 @@ static void NCCLReduceScatterWithScale( if (scale && !should_destroy_op) { size_t numel = recvcount * nranks; T *new_sendbuff = buffer.Alloc(numel); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, new_sendbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream); sendbuff = new_sendbuff; } From 09258040e2584f4afd9114b994710232e6769970 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:50:26 +0800 Subject: [PATCH 23/41] Move gather.h/gather.cu.h/scatter.h/scatter.cu.h to the phi library (#40043) * move gather.h gather.cu.h scatter.h scatter.cu.h to phi library * fix CI * fix rocm ci --- .../fluid/operators/detection/bbox_util.cu.h | 1 - .../detection/collect_fpn_proposals_op.cu | 10 +- .../detection/collect_fpn_proposals_op.h | 6 +- .../detection/distribute_fpn_proposals_op.cu | 5 +- .../detection/distribute_fpn_proposals_op.h | 15 +- .../detection/generate_mask_labels_op.cc | 1 - .../detection/generate_proposal_labels_op.cc | 16 +- .../detection/generate_proposals_op.cc | 18 +- .../detection/generate_proposals_op.cu | 9 +- .../detection/generate_proposals_v2_op.cc | 18 +- .../detection/generate_proposals_v2_op.cu | 9 +- paddle/fluid/operators/gather_nd_op.cu | 94 +++++----- paddle/fluid/operators/gather_nd_op.h | 66 ++++--- paddle/fluid/operators/gather_op.cu | 32 ++-- paddle/fluid/operators/gather_op.h | 68 +++---- paddle/fluid/operators/gather_test.cc | 4 +- paddle/fluid/operators/grid_sampler_op.h | 1 - .../fluid/operators/math/segment_pooling.cu | 6 +- paddle/fluid/operators/scatter_nd_add_op.cu | 41 ++-- paddle/fluid/operators/scatter_nd_add_op.h | 41 ++-- paddle/fluid/operators/scatter_op.cu | 50 +++-- paddle/fluid/operators/scatter_op.h | 63 +++---- paddle/fluid/operators/scatter_test.cc | 4 +- paddle/fluid/operators/segment_pool_op.cu | 1 - .../sequence_ops/sequence_scatter_op.cc | 2 - .../sequence_ops/sequence_scatter_op.h | 3 +- paddle/fluid/operators/viterbi_decode_op.cu | 38 ++-- paddle/fluid/operators/viterbi_decode_op.h | 128 +++++++------ .../kernels/funcs}/gather.cu.h | 176 +++++++++++------- .../operators => phi/kernels/funcs}/gather.h | 114 +++++++----- 
.../kernels/funcs}/scatter.cu.h | 124 ++++++------ .../operators => phi/kernels/funcs}/scatter.h | 165 ++++++++-------- 32 files changed, 702 insertions(+), 627 deletions(-) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.cu.h (62%) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.h (72%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.cu.h (67%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.h (65%) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index b361bc3ab75e8..f170fbbe4b534 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -23,7 +23,6 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ce9ac3de4e78c..860fdd01794cc 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -23,11 +23,11 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { namespace operators { @@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); Tensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); - GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, - &sorted_batch_id); + phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); + phi::funcs::GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, + &sorted_batch_id); Tensor batch_index_t; int* batch_idx_in = @@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, sizeof(int) * 8, dev_ctx.stream()); - GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); + phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); Tensor length_lod; int* length_lod_data = diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index a60f881ebf3e3..e5ae9a6ccbda5 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -21,7 +21,6 @@ limitations under the License.*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = context.MultiInput("MultiLevelScores"); - auto multi_rois_num = 
context.MultiInput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiInput("MultiLevelRoIsNum"); int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { } num_per_batch.emplace_back(post_nms_topN - pre_idx); if (context.HasOutput("RoisNum")) { - auto* rois_num = context.Output("RoisNum"); + auto* rois_num = context.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({batch_size}, context.GetPlace()); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index c117fbd70f528..7ad25e003b491 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -24,9 +24,9 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { start = end; multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + phi::funcs::GPUGather(dev_ctx, *fpn_rois, sub_idx, + multi_fpn_rois[i]); } else { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 628cbcd761186..5479e08c2a5ef 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -28,10 +27,11 @@ namespace operators { const int kBoxDim = 4; -inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { +inline std::vector GetLodFromRoisNum( + const framework::Tensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); - Tensor cpu_tensor; + framework::Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); @@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector fpn_rois_lod; int fpn_rois_num; if (context.HasInput("RoisNum")) { - auto* rois_num = context.Input("RoisNum"); + auto* rois_num = context.Input("RoisNum"); fpn_rois_lod = GetLodFromRoisNum(rois_num); } else { fpn_rois_lod = fpn_rois->lod().back(); @@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector num_rois_level(num_level, 0); std::vector num_rois_level_integral(num_level + 1, 0); for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { @@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector restore_index_inter(fpn_rois_num, -1); // distribute the rois into different fpn level by target level for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); size_t cur_offset = fpn_rois_lod[i]; @@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } - auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiOutput("MultiLevelRoIsNum"); if (multi_rois_num.size() > 0) { int batch_size = fpn_rois_lod.size() - 1; for (int i = 0; i < num_level; ++i) { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index e6af1a5bbf71c..c9cc4e722071c 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/mask_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 424aa0714400d..cbf17048400bf 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -16,8 +16,8 @@ limitations under the License. 
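The CPUGather/GPUGather call sites below keep their behaviour; only the namespace moves to phi::funcs. As a reminder of what the row-gather helpers compute, a scalar reference loop (illustrative only, not the Paddle implementation):

#include <algorithm>
#include <cstdint>

// out[i, :] = src[index[i], :] for a 2-D source of width `row_width`.
void GatherRowsReference(const float* src, const int* index, float* out,
                         int index_size, int row_width) {
  for (int i = 0; i < index_size; ++i) {
    const float* src_row = src + static_cast<int64_t>(index[i]) * row_width;
    std::copy(src_row, src_row + row_width,
              out + static_cast<int64_t>(i) * row_width);
  }
}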
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, fg_inds_t, &fg_boxes); + phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, bg_inds_t, &bg_boxes); + phi::funcs::CPUGather(context, boxes, bg_inds_t, &bg_boxes); Concat(context, fg_boxes, bg_boxes, sampled_boxes); - CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + phi::funcs::CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); fg_labels.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + phi::funcs::CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); - CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); Concat(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap); } @@ -334,7 +334,7 @@ std::vector SampleRoisForOneImage( } else { proposals_num = keep.numel(); roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - CPUGather(context, rpn_rois, keep, &roi_filter); + phi::funcs::CPUGather(context, rpn_rois, keep, &roi_filter); } T* roi_filter_dt = roi_filter.data(); memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 8c4bd4ac61320..d6130823271f0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6e3c322c17483..5fb7973fd89e4 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -85,8 +86,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -102,8 +103,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 6351ea865cd0e..1f1802574c5b8 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - 
CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 93ba3deca5fc4..005309e8ee577 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -86,8 +87,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -104,8 +105,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu index 0de2798bf7509..338c441161834 100644 --- a/paddle/fluid/operators/gather_nd_op.cu +++ b/paddle/fluid/operators/gather_nd_op.cu @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
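The gather_nd and scatter hunks that follow also switch the index-type check from framework::proto::VarType to branching directly on the tensor's phi::DataType. A condensed sketch of that idiom (the explicit template arguments are assumed; the real kernels inline this logic together with the INT32/INT64 enforcement message):

#include <cstdint>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/gather.h"

// Hypothetical helper: dispatch on the index dtype, then call the moved
// phi::funcs primitive with the matching index type.
template <typename T>
void GatherNdByIndexDtype(const paddle::platform::CPUDeviceContext& dev_ctx,
                          const paddle::framework::Tensor& x,
                          const paddle::framework::Tensor& index,
                          paddle::framework::Tensor* out) {
  if (index.dtype() == phi::DataType::INT32) {
    phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, x, index, out);
  } else {
    phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, x, index, out);
  }
}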
*/ #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { -template +template class GatherNdOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -33,27 +33,25 @@ class GatherNdOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - GPUGatherNd(ctx, *x, *index, output); + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); } } }; -template +template class GatherNdGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -71,24 +69,22 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterNdAdd(ctx, *dO, *index, dX); + auto &dev_ctx = ctx.cuda_device_context(); + if 
(index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; @@ -98,18 +94,16 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h index f458c0e18013b..d54261008e47b 100644 --- a/paddle/fluid/operators/gather_nd_op.h +++ b/paddle/fluid/operators/gather_nd_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -38,22 +38,20 @@ class GatherNdOpKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGatherNd(ctx.device_context(), *x, *index, output); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); } } }; @@ -65,6 +63,7 @@ class GatherNdGradOpKernel : public framework::OpKernel { 
PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); + auto *index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); @@ -75,22 +74,21 @@ class GatherNdGradOpKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - ScatterNdAdd(ctx, *dO, *index, dX); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index a502a13040949..8f1d9284c5038 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -49,11 +49,14 @@ class GatherOpCUDAKernel : public framework::OpKernel { } const auto &place = ctx.GetPlace(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &dev_ctx = ctx.cuda_device_context(); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -61,9 +64,9 @@ class GatherOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -93,14 +96,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } } + const auto &dev_ctx = ctx.cuda_device_context(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } return; } @@ -112,11 +116,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } } }; diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 016c2b398daaa..94de694b2f9bc 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel { // get axis from tensor if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); } return; } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGather(ctx.device_context(), *x, *index, output); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); } } }; @@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); } return; } dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = 
*ctx.template device_context() - .eigen_device(); + auto &place = *dev_ctx.eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; bool overwrite = ctx.Attr("overwrite"); - if (index_type == framework::proto::VarType::INT32) { + if (index_type == phi::DataType::INT32) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } - } else if (index_type == framework::proto::VarType::INT64) { + } else if (index_type == phi::DataType::INT64) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } } } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 0f3dcdadcf897..c962dd065234f 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::CPUGather(ctx, *src, *index, output); + phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 8f3c6660f51c4..93e96694270a4 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index bb6d8756bd0a3..fbdcb99c02ab9 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
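Taken together with the gather_test.cc change above, a compact usage sketch of the relocated header looks like this (the explicit float/int template arguments and the shapes are illustrative):

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/kernels/funcs/gather.h"

void GatherUsageSketch() {
  paddle::platform::CPUPlace place;
  paddle::platform::CPUDeviceContext ctx(place);

  paddle::framework::Tensor src, index, output;
  float* src_data = src.mutable_data<float>({4, 4}, place);
  int* idx_data = index.mutable_data<int>({2}, place);
  output.mutable_data<float>({2, 4}, place);

  for (int i = 0; i < 16; ++i) src_data[i] = static_cast<float>(i);
  idx_data[0] = 1;
  idx_data[1] = 3;

  // Rows 1 and 3 of src are copied into output; the helper used to live
  // under paddle::operators and now sits in phi::funcs.
  phi::funcs::CPUGather<float>(ctx, src, index, &output);
}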
*/ #include -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -379,9 +379,9 @@ class SegmentPoolGradFunctor { SimpleDiv<<>>(mean_grad.data(), summed_ids->data(), len, dim); - GPUGather(context, mean_grad, segments, in_grad); + phi::funcs::GPUGather(context, mean_grad, segments, in_grad); } else if (pooltype == "SUM") { - GPUGather(context, out_grad, segments, in_grad); + phi::funcs::GPUGather(context, out_grad, segments, in_grad); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN " diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu index 6448f8cc4056d..2fe3fcb759d34 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ b/paddle/fluid/operators/scatter_nd_add_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_nd_add_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -33,22 +33,20 @@ class ScatterNdAddOpCUDAKernel : public framework::OpKernel { auto *Out = ctx.Output("Out"); framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -69,12 +67,13 @@ class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.cuda_device_context(); // Gradient by Gather - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + if 
(index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h index 2bdf9ec58a850..81c95fe55abaa 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ b/paddle/fluid/operators/scatter_nd_add_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -37,23 +37,21 @@ class ScatterNdAddOpKernel : public framework::OpKernel { // In place output: Out = X framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -76,11 +74,12 @@ class ScatterNdAddGradientOpKernel : public framework::OpKernel { if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index 549e30803b464..7755e376bc195 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
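The scatter_nd_add hunks above keep the helper's semantics: each row of Updates is added into Out at the position named by the matching row of Ids, and duplicate ids accumulate, which is why the gather_nd gradient can reuse the same primitive. A scalar reference loop for the simplest one-level-index case (illustrative only):

#include <cstdint>

// out[ids[i], :] += updates[i, :]; repeated ids sum their contributions.
void ScatterNdAddReference(const int64_t* ids, const float* updates,
                           float* out, int num_updates, int row_width) {
  for (int i = 0; i < num_updates; ++i) {
    float* out_row = out + ids[i] * row_width;
    const float* upd_row = updates + static_cast<int64_t>(i) * row_width;
    for (int k = 0; k < row_width; ++k) out_row[k] += upd_row[k];
  }
}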
*/ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -35,23 +35,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel { framework::TensorCopy(*X, ctx.GetPlace(), Out); // use template class to support int32_t and int64_t - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op Index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } else { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } } }; @@ -68,36 +67,33 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } else { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git 
a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 69ab6c7135cd5..7733181a93fb6 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -39,29 +39,27 @@ class ScatterOpKernel : public framework::OpKernel { // In place output: Out = X, Out[Ids] = Updates framework::TensorCopy(*X, ctx.GetPlace(), Out); // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (overwrite) { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } } else { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } } } @@ -79,36 +77,33 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if 
(index_type == framework::proto::VarType::INT32) { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } else { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 0a4cab5fac1ab..93f2d60e5f232 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" #include @@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::ScatterAssign(ctx, src, index, &output); + phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data()[i], 0.0f); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu index 4e20844dc3275..e147e62a98354 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 2d4730635fd2a..25c12ab565a14 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h index 365381abc4683..2960b77d5ac0f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h @@ -15,8 +15,7 @@ limitations under the License. 
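The scatter_op hunks above preserve the meaning of the overwrite attribute: ScatterAssign overwrites the selected destination rows, while ScatterAssignAdd accumulates into them, so duplicate ids sum, assuming the destination rows were initialized first. A reference loop showing the difference (illustrative only):

#include <cstdint>

// overwrite == true : out[ids[i], :]  = updates[i, :]
// overwrite == false: out[ids[i], :] += updates[i, :]
void ScatterReference(const int* ids, const float* updates, float* out,
                      int num_updates, int row_width, bool overwrite) {
  for (int i = 0; i < num_updates; ++i) {
    float* dst = out + static_cast<int64_t>(ids[i]) * row_width;
    const float* src = updates + static_cast<int64_t>(i) * row_width;
    for (int k = 0; k < row_width; ++k) {
      dst[k] = overwrite ? src[k] : dst[k] + src[k];
    }
  }
}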
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 3c546dd8156a2..68628fb2748c4 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -11,8 +11,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -62,10 +62,11 @@ int64_t ComputeBlockSize(int64_t col) { template