From bec9fc9a902daf5f6669f1a34067f3411da21cc7 Mon Sep 17 00:00:00 2001 From: WangXi Date: Wed, 29 Sep 2021 11:51:20 +0800 Subject: [PATCH 01/80] [hybrid] Fix model parallel non-distributed param broadcast (#36186) --- .../sharding/offload_helper.py | 48 ++++++---- .../meta_optimizers/sharding_optimizer.py | 96 ++++++++++++------- .../test_fleet_hybrid_meta_optimizer.py | 16 ++-- .../test_fleet_sharding_meta_optimizer.py | 14 +-- 4 files changed, 105 insertions(+), 69 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index 3ad6e320316c6..bb6af1b3195f7 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -25,8 +25,9 @@ class OffloadHelper(object): cuda_place_type = 1 cuda_pinned_place_type = 2 - def __init__(self, ring_id=None): - self.ring_id = ring_id + def __init__(self, mp_ring_id=None, dp_ring_id=None): + self.mp_ring_id = mp_ring_id + self.dp_ring_id = dp_ring_id def _insert_cast_op(self, block, idx, src_name, dst_name): src_var = block.var(src_name) @@ -49,20 +50,31 @@ def _insert_cast_op(self, block, idx, src_name, dst_name): OP_ROLE_KEY: OpRole.Optimize }) - def _insert_broadcast_op(self, block, idx, param): - if self.ring_id is None: - return - block._insert_op_without_sync( - idx, - type="c_broadcast", - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self.ring_id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward, - }) + def _insert_broadcast_op(self, block, idx, param_name): + rings = [] + + if self.dp_ring_id is not None: + rings.append(self.dp_ring_id) + + # need sync non distributed param in mp group + if self.mp_ring_id is not None: + param = block.var(param_name) + if not hasattr(param, 'is_distributed') or not param.is_distributed: + rings.append(self.mp_ring_id) + + # the insert op order is: mp, dp + for ring in rings: + block._insert_op_without_sync( + idx, + type="c_broadcast", + inputs={'X': param_name}, + outputs={'Out': param_name}, + attrs={ + 'ring_id': ring, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward, + }) def _insert_memcpy_op(self, block, idx, src_name, dst_name, dst_place_type): src_var = block.var(src_name) @@ -236,7 +248,7 @@ def remove_param(input_name): self._insert_cast_op(startup_block, insert_idx, var_name, param_to_fp16[var_name]) # NOTE(wangxi): cast and offload should insert after broadcast param. - # the insert op order is: broadcast, cast, offload + # the insert op order is: {mp, dp}broadcast, cast, offload self._insert_broadcast_op(startup_block, insert_idx, var_name) @@ -489,6 +501,8 @@ def remove_param(input_name): self._insert_cast_op(startup_block, insert_idx, var_name, param_to_fp16[var_name]) + # NOTE(wangxi): cast and offload should insert after broadcast param. 
+ # the insert op order is: {mp, dp}broadcast, cast, offload self._insert_broadcast_op(startup_block, insert_idx, var_name) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 75a69e5527bc1..18211459a4e08 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -467,14 +467,16 @@ def _apply_optimize_offload_pass(self, params_grads): main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() + mp_ring_id = self.mp_ring_id if self.mp_degree > 1 else None dp_ring_id = self.dp_ring_id if self.dp_degree > 1 else None + offload_helper = OffloadHelper( + mp_ring_id=mp_ring_id, dp_ring_id=dp_ring_id) # optimize offload should be enable while gradient merge is enable and # acc_step is quite large (e.g. >> 100). Since its memcpy could not be # overlap with calc, otherwise it will slower down training severely. if sharding_configs["optimize_offload"]: logger.info("Sharding with optimize offload !") - offload_helper = OffloadHelper(ring_id=dp_ring_id) offload_helper.offload(main_block, startup_block) # The optimize_cast is already included in offload_fp32param offload_helper.offload_fp32param(main_block, startup_block) @@ -482,7 +484,6 @@ def _apply_optimize_offload_pass(self, params_grads): logger.info("Sharding with optimize cast !") # NOTE(wangxi): optimize_cast will persist fp16 param, it # will take more memory, but will be faster. Trade space for time. - offload_helper = OffloadHelper(ring_id=dp_ring_id) if self._optimizer_sharding: offload_helper.opt_sharding_cast_fp32param( main_block, startup_block, @@ -554,6 +555,10 @@ def minimize_impl(self, # init param broadcast should be called after startup pruning self._initialization_broadcast() + # NOTE(wangxi): if param is not persistable, program.clone will + # failed, so we remove no persistable param, recreate param as a var + self._recreate_not_persist_param_as_var() + self._dump_program_for_debug() # GPU need to wait server ready, GPU and NPU is Layered connection @@ -1385,23 +1390,14 @@ def _build_groups(self): return - def _initialization_broadcast(self): - """ - this funtion is to ensure the initialization between dp group to be - identical when hybrid-dp is used. 
- """ - if not self.hybrid_dp: - return - - startup_block = self._startup_program.global_block() - params = startup_block.all_parameters() - params_name = [] + def _recreate_not_persist_param_as_var(self): + def recreate_not_persist_param_as_var(program): + block = program.global_block() + params = block.all_parameters() + for param in params: + if param.persistable: + continue - # NOTE(wangxi): if param is not persistable, program.clone will - # failed, so we remove no persistable param, re add param as a var - for param in params: - params_name.append(param.name) - if not param.persistable: name = param.name shape = param.shape dtype = param.dtype @@ -1411,15 +1407,14 @@ def _initialization_broadcast(self): trainable = param.trainable optimize_attr = param.optimize_attr regularizer = param.regularizer - have_dist_attr = False is_distributed = False if hasattr(param, 'is_distributed'): have_dist_attr = True is_distributed = param.is_distributed - startup_block._remove_var(name, sync=False) - var = startup_block.create_var( + block._remove_var(name, sync=False) + var = block.create_var( name=name, shape=shape, dtype=dtype, @@ -1431,6 +1426,31 @@ def _initialization_broadcast(self): if have_dist_attr: var.is_distributed = is_distributed + block._sync_with_cpp() + + recreate_not_persist_param_as_var(self._startup_program) + recreate_not_persist_param_as_var(self._main_program) + + def _initialization_broadcast(self): + """ + this funtion is to ensure the initialization between dp group to be + identical when hybrid-dp is used, and the initialization of + not distributed param between mp group to be identical. + """ + if self.dp_degree <= 1 and self.mp_degree <= 1: + return + + startup_block = self._startup_program.global_block() + + params = startup_block.all_parameters() + params_name = [] + not_dist_param_name = set() + + for param in params: + params_name.append(param.name) + if not hasattr(param, 'is_distributed') or not param.is_distributed: + not_dist_param_name.add(param.name) + # offload and optimize_cast will insert broadcast op broadcast_params = set() for op in startup_block.ops: @@ -1439,23 +1459,25 @@ def _initialization_broadcast(self): for param in params_name: if param in broadcast_params: continue - startup_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self.dp_ring_id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) - startup_block.append_op( - type='c_sync_comm_stream', - inputs={'X': params_name}, - outputs={'Out': params_name}, - attrs={'ring_id': self.dp_ring_id, - OP_ROLE_KEY: OpRole.Forward}) + rings = [] + # need sync not distributed param in mp group + if self.mp_degree > 1 and param in not_dist_param_name: + rings.append(self.mp_ring_id) + if self.dp_degree > 1: + rings.append(self.dp_ring_id) + + for ring in rings: + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) startup_block._sync_with_cpp() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index 6eb566935d9d5..35b74eac4b075 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -72,8 +72,7 @@ def test_opt_sharding_with_pp(self): 'c_gen_nccl_id', 
'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -155,8 +154,7 @@ def test_opt_sharding_with_pp_with_allreduce_fuse(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -218,7 +216,7 @@ def test_opt_sharding_with_pp_amp_gclip(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -292,7 +290,7 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -371,7 +369,7 @@ def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', - 'cast', 'c_broadcast', 'c_sync_comm_stream' + 'cast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -460,7 +458,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' + 'c_comm_init', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -511,7 +509,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 73eacd118ecad..7cb033b748874 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -655,7 +655,9 @@ def test_hybrid_with_mp_pp_amp_gclip(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init' + 'c_gen_nccl_id', 
'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -764,7 +766,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -932,7 +934,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', - 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1029,7 +1031,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', - 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1129,7 +1131,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', - 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1221,7 +1223,7 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ From 7bddf2e88fe1ee64cf695b4198cc398504cf90b5 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Wed, 29 Sep 2021 14:42:51 +0800 Subject: [PATCH 02/80] [NPU] mod for model bert (#36165) * merge conflict of paddle_gtest_main.cc * modify FLAGS_npu_precision_mode and default not to call aclSetCompileopt --- .../elementwise/elementwise_sub_op_npu.cc | 4 +- .../fluid/operators/fill_any_like_op_npu.cc | 12 +- paddle/fluid/operators/npu_op_runner.cc | 8 + paddle/fluid/operators/slice_op_npu.cc | 27 ++- paddle/fluid/platform/flags.cc | 7 + .../npu/test_elementwise_sub_op_npu.py | 5 + .../npu/test_fill_any_like_op_npu.py | 6 + .../tests/unittests/npu/test_slice_op_npu.py | 226 ++++++++++++++++++ 8 files changed, 290 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 94e78defbbee5..48b98dafc7bb5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -166,9 +166,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, +REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, + ops::ElementwiseSubNPUKernel, ops::ElementwiseSubNPUKernel); 
REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index d5204f5cacfc6..566b265bfdba9 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -63,9 +63,12 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { .stream(); auto shape = out->dims(); - const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out}, - {{"dims", framework::vectorize(shape)}}); - runner.Run(stream); + NpuOpRunner runner; + runner.SetType("Fill") + .AddInput(framework::vectorize(shape)) + .AddInput(tensor_tmp) + .AddOutput(*out) + .Run(stream); } }; @@ -75,5 +78,8 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(fill_any_like, ops::FillAnyLikeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::FillAnyLikeNPUKernel, +#endif ops::FillAnyLikeNPUKernel, ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index bb6549c111988..d10e94962d6a6 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -26,6 +26,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" +DECLARE_string(npu_precision_mode); + namespace paddle { namespace operators { @@ -404,6 +406,12 @@ void NpuOpRunner::Run(aclrtStream stream) const { VLOG(4) << "attr: " << attr_; VLOG(4) << "stream: " << stream; + if (!FLAGS_npu_precision_mode.empty()) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclSetCompileopt(ACL_PRECISION_MODE, FLAGS_npu_precision_mode.c_str())); + VLOG(4) << "set ACL_PRECISION_MODE: " << FLAGS_npu_precision_mode; + } + aclError ret = aclopCompileAndExecute( op_type_.c_str(), input_descs_.size(), input_descs_.data(), input_buffers_.data(), output_descs_.size(), output_descs_.data(), diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 1084eadc55c5b..f8bf46da4a638 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -181,12 +181,37 @@ class SliceGradNPUKernel : public framework::OpKernel { paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); } + Tensor tmp_dout; + tmp_dout.ShareDataWith(*dout); + auto out_dims = dout->dims(); + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == static_cast(in_dims.size())) { + out_dims = framework::make_ddim(std::vector(decrease_size, 1)); + } else { + std::vector origin_out_shape(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; + } + int index = 0; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; + ++index; + } + } + out_dims = framework::make_ddim(origin_out_shape); + } + tmp_dout.Resize(out_dims); + } + dinput->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() .stream(); const auto& runner = - NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); + NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); runner.Run(stream); } }; diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 
b97c3106439be..89a829f9490f9 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -121,6 +121,13 @@ PADDLE_DEFINE_EXPORTED_string( "If proveided, it will be passed to aclInit()."); PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); +PADDLE_DEFINE_EXPORTED_string( + npu_precision_mode, "", + "NPU operator precision mode, options are 'force_fp32', 'force_fp16', " + "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and " + "'allow_mix_precision'. If you want to use the default mode (" + "allow_fp32_to_fp16), set this to empty string. For more details, " + "please refer to the documents"); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py index 6faa77b460213..7c8710fd42b36 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -90,6 +90,11 @@ def test_check_output(self): # max_relative_error=0.006,) +class TestElementwiseSubOpInt32(TestElementwiseSubOp): + def init_dtype(self): + self.dtype = np.int32 + + class TestSubtractAPI(unittest.TestCase): def test_name(self): with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py index a687509e6ae9c..c3074db1aaff6 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py @@ -57,6 +57,12 @@ def init(self): self.value = -1 +class TestFillAnyLikeNPUOpInt64(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.int64 + self.value = -1 + + class TestFillAnyLikeNPUOpFloat32(TestFillAnyLikeNPUOp): def init(self): self.dtype = np.float32 diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 5a38f14868bb8..055c3015f82f5 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -301,5 +301,231 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +class TestSliceOpDecsDim(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.set_inputs() + self.set_outputs() + self.set_attrs() + + def set_inputs(self): + self.inputs = {'Input': self.input} + + def set_outputs(self): + self.outputs = {'Out': self.out} + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def init_dtype(self): + self.dtype = np.float32 + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['Input'], 'Out') + + +class 
TestSliceOpDecsDimFp16(TestSliceOpDecsDim): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDim2(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOpDecsDim3(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + +class TestSliceOpDecsDim4(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOpDecsDim5(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + +class TestSliceOpDecsDim6(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOpDecsDimStartsTensor(TestSliceOpDecsDim): + def set_inputs(self): + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype='int32') + } + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0:3, 2:4, :] + + +class TestSliceOpDecsDimStartsTensorFP16(TestSliceOpDecsDimStartsTensor): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDimStartsTensorStartsAndEndsTensor(TestSliceOpDecsDim): + def set_inputs(self): + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype='int64'), + "EndsTensor": np.array( + self.ends, dtype='int32') + } + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOpDecsDimStartsTensorStartsAndEndsTensorFP16( + TestSliceOpDecsDimStartsTensorStartsAndEndsTensor): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDimStartsListTensor(TestSliceOpDecsDim): + def set_inputs(self): + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * 
ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, -1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + self.starts_infer = [1, -1, 2] + + +class TestSliceOpDecsDimStartsListTensor2(TestSliceOpDecsDimStartsListTensor): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [-1] + self.out = self.input[:, :, :, -1] + + self.starts_infer = [-1] + + +class TestSliceOpDecsDimStartsListTensorFP16( + TestSliceOpDecsDimStartsListTensor): + def init_dtype(self): + self.dtype = np.float16 + + if __name__ == '__main__': unittest.main() From c79de7286e4463119639f97143ef1f91cc70d6a9 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Wed, 29 Sep 2021 14:44:27 +0800 Subject: [PATCH 03/80] [NPU] Add group norm (#35937) * [NPU] Add group norm * [NPU] Add group norm * [NPU] Add group norm * [NPU] Add group norm * [NPU] Add group_norm op --- paddle/fluid/operators/group_norm_op_npu.cc | 306 ++++++++++++++++++ .../unittests/npu/test_group_norm_op_npu.py | 217 +++++++++++++ 2 files changed, 523 insertions(+) create mode 100644 paddle/fluid/operators/group_norm_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc new file mode 100644 index 0000000000000..4ef8320cbdecd --- /dev/null +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/group_norm_op.h" +#include +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct GroupNormFunction { + public: + explicit GroupNormFunction(const framework::ExecutionContext& ctx) + : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + void ReduceMean(const Tensor* x, Tensor* y, const std::vector& dim, + bool keep_dims = true) { + // y should be init first + const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*y}, + {{"axes", dim}, {"keep_dims", keep_dims}}); + runner.Run(stream); + } + void ReduceSum(const Tensor* x, Tensor* y, const std::vector& dim, + bool keep_dims = true) { + // y should be init first + const auto& runner = NpuOpRunner("ReduceSumD", {*x}, {*y}, + {{"axes", dim}, {"keep_dims", keep_dims}}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Div(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + // y should be init first + const auto& runner = + NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); + runner.Run(stream); + } + void Sqrt(const Tensor* x, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + Tensor ReduceMeanToNG(const Tensor* x, const DataLayout& data_layout, + const int64_t N, const int64_t C, const int64_t H, + const int64_t W, const int G) { + Tensor y(x->type()); + // y.mutable_data( {N,G,1}, place ); + if (data_layout == DataLayout::kNCHW) { + y.mutable_data({N, G, 1}, place); + // shape of x is [N, G, C*H*W/G] + this->ReduceMean(x, &y, std::vector{2}); + } else { + y.mutable_data({N, 1, G}, place); + // shape of x is [N, C*H*W/G, G] + Tensor x_trans(x->type()); + x_trans.mutable_data({N, G, C * H * W / G}, place); + this->Transpose(x, &x_trans, std::vector{0, 2, 1}); + this->ReduceMean(&x_trans, &y, std::vector{2}); + } + return y; + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +class GroupNormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); 
+ auto* x = ctx.Input("X"); + + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); + const auto groups = ctx.Attr("groups"); + + auto place = ctx.GetPlace(); + Tensor xnorm(x->type()); + xnorm.mutable_data(x->dims(), place); + GroupNormFunction F(ctx); + if (data_layout != DataLayout::kNCHW) { + xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); + F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); + } else { + TensorCopy(*x, platform::NPUPlace(), &xnorm); + } + auto N = xnorm.dims()[0]; + auto C = xnorm.dims()[1]; + auto H = xnorm.dims()[2]; + auto W = xnorm.dims()[3]; + xnorm.Resize({N * groups, C * H * W / groups}); + std::vector axis = {1}; + auto reduce_dim = mean->dims(); + + mean->mutable_data({N * groups, 1}, place); + var->mutable_data({N * groups, 1}, place); + y->mutable_data(place); + F.ReduceMean(&xnorm, mean, axis); + + F.Sub(&xnorm, mean, &xnorm); + Tensor sqr(x->type()); + sqr.mutable_data(xnorm.dims(), place); + + F.Mul(&xnorm, &xnorm, &sqr); + F.ReduceMean(&sqr, var, axis); + Tensor std(x->type()); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + y->Resize(xnorm.dims()); + F.Div(&xnorm, &std, y); + y->Resize({N, C, H, W}); + if (scale) { + Tensor scale_t(scale->type()); + scale_t.ShareDataWith(*scale); + scale_t.Resize({C, 1, 1}); + F.Mul(y, &scale_t, y); + } + if (bias) { + Tensor bias_t(bias->type()); + bias_t.ShareDataWith(*bias); + bias_t.Resize({C, 1, 1}); + F.Add(y, &bias_t, y); + } + if (data_layout != DataLayout::kNCHW) { + F.Transpose(y, y, std::vector{0, 2, 3, 1}); + y->Resize({x->dims()}); + } + mean->Resize(reduce_dim); + var->Resize(reduce_dim); + } +}; + +template +class GroupNormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* y = ctx.Input("Y"); + auto* var = ctx.Input("Variance"); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + const auto G = ctx.Attr("groups"); + + // init output + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + + GroupNormFunction F(ctx); + auto place = ctx.GetPlace(); + auto _type = y->type(); + + Tensor xnorm(_type); + xnorm.mutable_data(y->dims(), place); + Tensor scale_share(_type); + scale_share.ShareDataWith(*scale); + Tensor bias_share(_type); + bias_share.ShareDataWith(*bias); + + int64_t N = y->dims()[0]; + int64_t C, H, W; + framework::DDim scale_bias_dim; + if (data_layout == DataLayout::kNCHW) { + C = y->dims()[1]; + H = y->dims()[2]; + W = y->dims()[3]; + scale_bias_dim = framework::make_ddim({C, 1, 1}); + } else { + C = y->dims()[3]; + H = y->dims()[1]; + W = y->dims()[2]; + scale_bias_dim = framework::make_ddim({1, 1, C}); + } + scale_share.Resize(scale_bias_dim); + bias_share.Resize(scale_bias_dim); + F.Sub(y, &bias_share, &xnorm); + F.DivNoNan(&xnorm, &scale_share, &xnorm); + + if (d_bias) { + d_bias->mutable_data(place); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); + } else { + F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); + } + } + if (d_scale) { + 
d_scale->mutable_data(place); + Tensor dy_xnorm(_type); + dy_xnorm.mutable_data(d_y->dims(), place); + F.Mul(d_y, &xnorm, &dy_xnorm); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); + } else { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); + } + } + + // std = Sqrt(var+epsilon), init shape = [ N, G ] + Tensor std(_type); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + // d_xnorm_std = dy_proc * scale / std + Tensor d_xnorm_std(_type); + d_xnorm_std.mutable_data(y->dims(), place); + F.Mul(d_y, &scale_share, &d_xnorm_std); + if (data_layout == DataLayout::kNCHW) { + xnorm.Resize({N, G, C * H * W / G}); + d_xnorm_std.Resize({N, G, C * H * W / G}); + std.Resize({N, G, 1}); + } else { + xnorm.Resize({N, C * H * W / G, G}); + d_xnorm_std.Resize({N, C * H * W / G, G}); + std.Resize({N, 1, G}); + } + F.Div(&d_xnorm_std, &std, &d_xnorm_std); + + // d_x = d_xnorm_std + // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm + // - Mean ( d_xnorm_std, axis=1, keepdim=True ) + d_x->mutable_data(place); + d_x->Resize(xnorm.dims()); + F.Mul(&d_xnorm_std, &xnorm, d_x); + Tensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); + F.Mul(&dx1, &xnorm, d_x); + + Tensor dx2 = F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); + + F.Sub(&d_xnorm_std, d_x, d_x); + F.Sub(d_x, &dx2, d_x); + + d_x->Resize(y->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(group_norm, ops::GroupNormNPUKernel, + ops::GroupNormNPUKernel); +REGISTER_OP_NPU_KERNEL(group_norm_grad, ops::GroupNormGradNPUKernel, + ops::GroupNormGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py new file mode 100644 index 0000000000000..9ab1161be36dd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import unittest +import numpy as np + +import sys +sys.path.append("..") + +from operator import mul +from op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + +paddle.enable_static() + + +def group_norm_naive(x, scale, bias, epsilon, groups, data_layout): + if data_layout == "NHWC": + x = np.transpose(x, (0, 3, 1, 2)) # NHWC => NCHW + N, C, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + xnorm = (x - mean) / np.sqrt(var + epsilon) + xnorm = xnorm.reshape((N, C, H, W)) + output = xnorm * scale.reshape((-1, 1, 1)) + bias.reshape((-1, 1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 3, 1)) # NCHW => NHWC + xnorm = np.transpose(xnorm, (0, 2, 3, 1)) + return output, mean.reshape((N, G)), var.reshape((N, G)) + + +class TestGroupNormOpError(unittest.TestCase): + def test_errors(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + + def test_x_type(): + input = np.random.random(2, 100, 3, 5).astype('float32') + groups = 2 + fluid.layers.group_norm(input, groups) + + self.assertRaises(TypeError, test_x_type) + + def test_x_dtype(): + x2 = fluid.layers.data( + name='x2', shape=[2, 100, 3, 5], dtype='int32') + groups = 2 + fluid.layers.group_norm(x2, groups) + + self.assertRaises(TypeError, test_x_dtype) + + +class TestGroupNormOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = 'group_norm' + self.place = paddle.NPUPlace(0) + + self.init_dtype() + + self.data_format = "NCHW" + self.atol = 1e-6 + self.max_relative_error = 0.005 + self.shape = (2, 100, 3, 5) + self.attrs = {'epsilon': 1e-5, 'groups': 2, 'data_layout': "NCHW"} + self.compare_between_place = False + self.init_test_case() + + input = np.random.random(self.shape).astype(self.dtype) + if self.data_format == "NHWC": + input = np.transpose(input, (0, 2, 3, 1)) + scale = np.random.random([self.shape[1]]).astype(self.dtype) + bias = np.random.random([self.shape[1]]).astype(self.dtype) + output, mean, var = group_norm_naive( + input, scale, bias, self.attrs['epsilon'], self.attrs['groups'], + self.data_format) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(input), + 'Scale': OpTest.np_dtype_to_fluid_dtype(scale), + 'Bias': OpTest.np_dtype_to_fluid_dtype(bias) + } + self.outputs = {'Y': output, 'Mean': mean, 'Variance': var} + self.attrs['data_layout'] = self.data_format + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=self.atol) + + def test_check_grad(self): + if self.dtype == np.float16: + return + + self.__class__.exist_check_grad = True + inputs_to_check = ['X', 'Scale', 'Bias'] + output_names = 'Y' + no_grad_set = set() + cpu_place = fluid.CPUPlace() + cpu_grads = self._get_gradient(inputs_to_check, cpu_place, output_names, + no_grad_set) + npu_grads = self._get_gradient(inputs_to_check, self.place, + output_names, no_grad_set) + + self._assert_is_close(cpu_grads, npu_grads, inputs_to_check, + self.max_relative_error, + "Gradient Check between places") + + def init_test_case(self): + pass + + +class TestGroupNormOp1(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + + +class TestGroupNormOp2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + + +class TestGroupNormOpBigEps1(TestGroupNormOp): + def 
init_test_case(self): + self.attrs['groups'] = 1 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps3(TestGroupNormOp): + def init_test_case(self): + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOp1_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.data_format = "NHWC" + + +class TestGroupNormOp2_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps1_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps2_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps3_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpFP16(TestGroupNormOp): + def init_dtype(self): + self.dtype = np.float16 + + +class TestGroupNormOpFP16_With_NHWC(TestGroupNormOp): + def init_dtype(self): + self.dtype = np.float16 + + def init_test_case(self): + self.data_format = "NHWC" + + +class TestGroupNormException(unittest.TestCase): + # data_layout is not NHWC or NCHW + def test_exception(self): + data = fluid.data(name='data', shape=[None, 3, 3, 4], dtype="float64") + + def attr_data_format(): + out = fluid.layers.group_norm( + input=data, groups=2, data_layout="NDHW") + + self.assertRaises(ValueError, attr_data_format) + + +if __name__ == '__main__': + unittest.main() From 2b8fd704d0ec555b5b27d50fca261741a7fbbf28 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 29 Sep 2021 14:50:43 +0800 Subject: [PATCH 04/80] fix bug of top_k npu op (#36175) --- paddle/fluid/operators/top_k_op_npu.cc | 4 ++- .../tests/unittests/npu/test_top_k_op_npu.py | 36 +++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index ca3a5f957685d..a7d8fe01edd4c 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -51,7 +51,9 @@ class TopkNPUKernel : public framework::OpKernel { indices->mutable_data(ctx.GetPlace()); // prepare assit - auto dim = input->dims().size(); + auto size = input->dims().size(); + // dim is the last dimension of input + auto dim = input->dims()[size - 1]; framework::Tensor assist_seq_tensor; assist_seq_tensor.Resize({2 * dim}); assist_seq_tensor.mutable_data(ctx.GetPlace()); diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py index b735adf76d6c1..c8a620d9dbb35 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py @@ -22,6 +22,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core +from test_top_k_v2_op_npu import numpy_topk paddle.enable_static() SEED = 2021 @@ -87,5 +88,40 @@ def test_check_output(self): self.check_output_with_place(self.place) +class TestTopkV3(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "top_k" + + self.init_dtype() + self.set_input_data() + self.set_attrs() + output, indices = numpy_topk( + self.input_data, 
axis=self.axis, k=self.k, largest=True) + + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis} + self.outputs = {'Out': output, 'Indices': indices} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_attrs(self): + self.k = 3 + self.axis = 1 + + def set_input_data(self): + self.input_data = np.random.choice( + 10000, size=(10, 20), replace=False).astype(self.dtype) + + if __name__ == '__main__': unittest.main() From 83578cfad12bf1925171c1501cea2bef4a679d3f Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Wed, 29 Sep 2021 14:52:05 +0800 Subject: [PATCH 05/80] [npu] add box coder (#36171) * [npu] add box coder * [npu] add box coder --- .../fluid/operators/detection/CMakeLists.txt | 7 +- .../operators/detection/box_coder_op_npu.cc | 373 ++++++++++++++++++ .../unittests/npu/test_box_coder_op_npu.py | 252 ++++++++++++ 3 files changed, 631 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/detection/box_coder_op_npu.cc create mode 100644 python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c04d04f841388..4e951f6318cc9 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -15,8 +15,13 @@ function(detection_library TARGET_NAME) PARENT_SCOPE) endfunction() +if (WITH_ASCEND_CL) + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) +else() + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) +endif() + detection_library(bipartite_match_op SRCS bipartite_match_op.cc) -detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc new file mode 100644 index 0000000000000..9d97c7af9630c --- /dev/null +++ b/paddle/fluid/operators/detection/box_coder_op_npu.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/box_coder_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct BoxCoderFunction { + public: + explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + Tensor Adds(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Muls(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Mul(const Tensor& x, const Tensor& y) { + Tensor z; + z.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + Tensor SubWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + z.mutable_data(shape, place); + const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + void DivWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor DivWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + DivWithBroadCastVoid(x, y, shape, &z); + return z; + } + void MulWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor MulWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + MulWithBroadCastVoid(x, y, shape, &z); + return z; + } + void AddWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor AddWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + AddWithBroadCastVoid(x, y, shape, &z); + return z; + } + Tensor Abs(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Log(const Tensor& x) { + Tensor t_x_m1 = Adds(x, -1); + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Exp(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Dot(const Tensor& x, const Tensor& y) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + PADDLE_ENFORCE_EQ( + dim_x.size(), 2, + platform::errors::InvalidArgument( + "x should be a 2-dim tensor, but got %d-dim.", dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_y.size(), 2, + platform::errors::InvalidArgument( + "y should be a 2-dim tensor, but got %d-dim.", dim_y.size())); + PADDLE_ENFORCE_EQ( + dim_x[1], dim_y[0], + platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but " + "got dim_x[1] = %d, dim_y[0] = %d.", + dim_x[1], dim_y[0])); + 
Tensor z; + z.mutable_data({dim_x[0], dim_y[1]}, place); + const auto& runner = + NpuOpRunner("MatMul", {x, y}, {z}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + runner.Run(stream); + return z; + } + void ConcatVoid(const std::vector& inputs, + const framework::DDim& shape_out, int axis, Tensor* output) { + output->mutable_data(shape_out, place); + std::vector names; + for (size_t i = 0; i < inputs.size(); i++) { + names.push_back("x" + std::to_string(i)); + } + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*output}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; + runner.AddInputNames(names); + runner.Run(stream); + } + Tensor Concat(const std::vector& inputs, + const framework::DDim& shape_out, int axis) { + Tensor output; + ConcatVoid(inputs, shape_out, axis, &output); + return output; + } + Tensor Slice(const Tensor& x, const std::vector& offsets, + const std::vector& size, const framework::DDim& shape) { + Tensor y; + y.mutable_data(shape, place); + const auto& runner = + NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); + runner.Run(stream); + return y; + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +void Vector2Tensor(const framework::ExecutionContext& ctx, + const std::vector& vec, const framework::DDim& ddim, + Tensor* tsr) { + framework::TensorFromVector(vec, ctx.device_context(), tsr); + ctx.template device_context().Wait(); + tsr->Resize(ddim); +} + +template +void BoxCoderEnc(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, Tensor* out) { + auto M = pb->dims()[0]; + auto N = tb->dims()[0]; + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + Tensor tb_xy = F.Dot(*tb, m_aver); + Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); + + pb_xy.Resize({1, M, 2}); + pb_wh.Resize({1, M, 2}); + tb_xy.Resize({N, 1, 2}); + tb_wh.Resize({N, 1, 2}); + + auto shape_half = framework::make_ddim({N, M, 2}); + auto shape_full = framework::make_ddim({N, M, 4}); + + Tensor out_xy_0 = F.DivWithBroadCast( + F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); + Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); + Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); + + if (pbv) { + F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); + } else { + Tensor t_var; + std::vector vec_var(4); + for (auto i = 0; i < 4; i++) { + vec_var[i] = static_cast(variance[i]); + } + Vector2Tensor(ctx, vec_var, framework::make_ddim({1, 1, 4}), &t_var); + F.DivWithBroadCastVoid(out_0, t_var, shape_full, out); + } +} + +template +void BoxCoderDec(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, int axis, Tensor* out) { + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + auto pb_resize_shape = axis == 0 + ? framework::make_ddim({1, pb->dims()[0], 2}) + : framework::make_ddim({pb->dims()[0], 1, 2}); + pb_xy.Resize(pb_resize_shape); + pb_wh.Resize(pb_resize_shape); + + auto tbox_slice_shape = + framework::make_ddim({tb->dims()[0], tb->dims()[1], 2}); + std::vector tbox_slice_size = {static_cast(tb->dims()[0]), + static_cast(tb->dims()[1]), 2}; + Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); + Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); + + Tensor tb_xy; + Tensor tb_wh; + if (pbv) { + auto pbvt_slice_shape = framework::make_ddim({pbv->dims()[0], 2}); + auto pbvt_resize_shape = axis == 0 + ? 
framework::make_ddim({1, pbv->dims()[0], 2}) + : framework::make_ddim({pbv->dims()[0], 1, 2}); + std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; + Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); + Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); + pbv_t01.Resize(pbvt_resize_shape); + pbv_t23.Resize(pbvt_resize_shape); + + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } else if (variance.empty()) { + F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); + } else { + Tensor t_var01, t_var23; + auto t_var_shape = framework::make_ddim({1, 1, 2}); + std::vector vec_var01 = {static_cast(variance[0]), + static_cast(variance[1])}; + std::vector vec_var23 = {static_cast(variance[2]), + static_cast(variance[3])}; + Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01); + Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23); + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, + F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape), + tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } + Tensor obox01 = + F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); + Tensor obox23 = + F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), + (norm ? 0 : -1)); + F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); +} + +template +class BoxCoderNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* prior_box = ctx.Input("PriorBox"); + auto* prior_box_var = ctx.Input("PriorBoxVar"); + auto* target_box = ctx.Input("TargetBox"); + auto* output_box = ctx.Output("OutputBox"); + std::vector variance = ctx.Attr>("variance"); + const int axis = ctx.Attr("axis"); + + if (prior_box_var) { + PADDLE_ENFORCE_EQ(variance.empty(), true, + platform::errors::InvalidArgument( + "Input 'PriorBoxVar' and attribute 'variance'" + " of BoxCoder operator should not be used at the " + "same time.")); + } + if (!(variance.empty())) { + PADDLE_ENFORCE_EQ(static_cast(variance.size()), 4, + platform::errors::InvalidArgument( + "Size of attribute 'variance' in BoxCoder operator" + " should be 4. 
But received size is %d", + variance.size())); + } + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + platform::errors::InvalidArgument( + "Input 'TargetBox' of BoxCoder operator only" + " supports LoD with one level.")); + } + + auto code_type = GetBoxCodeType(ctx.Attr("code_type")); + bool normalized = ctx.Attr("box_normalized"); + + if (code_type == BoxCodeType::kEncodeCenterSize) { + BoxCoderEnc(ctx, target_box, prior_box, prior_box_var, normalized, + variance, output_box); + } else { + BoxCoderDec(ctx, target_box, prior_box, prior_box_var, normalized, + variance, axis, output_box); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(box_coder, ops::BoxCoderNPUKernel, + ops::BoxCoderNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py new file mode 100644 index 0000000000000..4d4d61ace841e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py @@ -0,0 +1,252 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import math +import paddle +from op_test import OpTest + +paddle.enable_static() + +np.random.seed(2021) + + +def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + + shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else ( + pb_v.shape[0], 1, pb_v.shape[1]) + pb_v = pb_v.reshape(var_shape) + if pb_v.ndim == 1: + tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h + else: + tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h + output_box[:, :, 0] = tb_x - tb_w / 2 + output_box[:, :, 1] = tb_y - tb_h / 2 + output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm) + output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm) + + +def box_encoder(t_box, p_box, pb_v, output_box, norm): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + shape = (1, p_box.shape[0]) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = 
pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1) + tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1) + tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm) + tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm) + if pb_v.ndim == 1: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3] + else: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3] + + +def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0): + n = t_box.shape[0] + m = p_box.shape[0] + if code_type == "decode_center_size": + m = t_box.shape[1] + output_box = np.zeros((n, m, 4), dtype=np.float32) + cur_offset = 0 + + for i in range(len(lod)): + if (code_type == "encode_center_size"): + box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v, + output_box[cur_offset:(cur_offset + lod[i]), :, :], + norm) + elif (code_type == "decode_center_size"): + box_decoder(t_box, p_box, pb_v, output_box, norm, axis) + cur_offset += lod[i] + return output_box + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestBoxCoderOp(OpTest): + def setUp(self): + self.op_type = "box_coder" + self.set_npu() + self.init_dtype() + + self.set_init_config() + self.set_inputs() + self.set_attrs() + self.set_outputs() + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_init_config(self): + self.M = 81 + self.N = 20 + self.code_type = 'decode_center_size' + self.box_normalized = False + self.lod = [[1, 1, 1, 1, 1]] + self.axis = 0 + self.use_variance = False + self.without_prior_box_var = False + self.atol = 1e-5 + + def set_inputs(self): + self.inputs = {} + assert (self.code_type in ['decode_center_size', 'encode_center_size']) + assert (self.axis in [0, 1]) + if self.code_type == 'decode_center_size': + assert (not self.use_variance or not self.without_prior_box_var) + + self.prior_box = np.random.random((self.M, 4)).astype(self.dtype) + + if self.use_variance: + self.prior_box_var = np.random.random(4).astype(self.dtype) + else: + if self.without_prior_box_var: + self.prior_box_var = np.ones((self.M, 4)).astype(self.dtype) + else: + self.prior_box_var = np.random.random( + (self.M, 4)).astype(self.dtype) + + if self.axis == 0: + self.target_box = np.random.random( + (self.N, self.M, 4)).astype(self.dtype) + else: + self.target_box = np.random.random( + (self.M, self.N, 4)).astype(self.dtype) + self.inputs['PriorBox'] = self.prior_box + self.inputs['TargetBox'] = self.target_box + if (not self.use_variance and not self.without_prior_box_var): + self.inputs['PriorBoxVar'] = self.prior_box_var + else: + #encode_center_size + self.prior_box = np.random.random((self.M, 4)).astype(self.dtype) + if self.use_variance: + self.prior_box_var = np.random.random(4).astype(self.dtype) + else: + self.prior_box_var = np.random.random( + (self.M, 4)).astype(self.dtype) + self.target_box = np.random.random((self.N, 
4)).astype(self.dtype) + self.inputs['PriorBox'] = self.prior_box + #self.inputs['PriorBoxVar'] = self.prior_box_var + self.inputs['TargetBox'] = (self.target_box, self.lod) + if (not self.use_variance): + self.inputs['PriorBoxVar'] = self.prior_box_var + + def set_attrs(self): + self.attrs = { + 'code_type': self.code_type, + 'box_normalized': self.box_normalized + } + if self.use_variance: + self.attrs['variance'] = self.prior_box_var.astype( + np.float).flatten() + if self.axis != 0: + self.attrs['axis'] = self.axis + + def set_outputs(self): + output_box = batch_box_coder( + self.prior_box, self.prior_box_var, self.target_box, self.lod[0], + self.code_type, self.box_normalized, self.axis) + self.outputs = {'OutputBox': output_box.astype(self.dtype)} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=self.atol) + + +class TestBoxCoderOpWithoutBoxVar(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithoutBoxVar, self).set_init_config() + self.without_prior_box_var = True + self.lod = [[0, 1, 2, 3, 4, 5]] + + +class TestBoxCoderOpWithLoD(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithLoD, self).set_init_config() + self.M = 20 + self.N = 50 + self.lod = [[10, 20, 20]] + self.code_type = 'encode_center_size' + self.box_normalized = True + + +class TestBoxCoderOpWithLoDWithVariance(TestBoxCoderOpWithLoD): + def set_init_config(self): + super(TestBoxCoderOpWithLoDWithVariance, self).set_init_config() + self.use_variance = True + + +class TestBoxCoderOpWithAxis(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithAxis, self).set_init_config() + self.axis = 1 + + +class TestBoxCoderOpWithVariance(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithVariance, self).set_init_config() + self.use_variance = True + + +class TestBoxCoderOpFP16(TestBoxCoderOp): + def init_dtype(self): + self.dtype = np.float16 + + def set_init_config(self): + super(TestBoxCoderOpFP16, self).set_init_config() + self.atol = 1e-2 + + +if __name__ == '__main__': + unittest.main() From 79bd5f90f304c239f2b51778c977648016174381 Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Wed, 29 Sep 2021 14:59:53 +0800 Subject: [PATCH 06/80] add slot record dataset (#36200) --- paddle/fluid/framework/channel.h | 20 +- paddle/fluid/framework/data_feed.cc | 112 +++++++- paddle/fluid/framework/data_feed.h | 317 +++++++++++++++++++++- paddle/fluid/framework/data_set.cc | 166 +++++++++-- paddle/fluid/framework/data_set.h | 40 ++- paddle/fluid/framework/dataset_factory.cc | 3 +- paddle/fluid/platform/flags.cc | 8 + paddle/fluid/pybind/data_set_py.cc | 2 - 8 files changed, 622 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 503f1513aad20..80fee94f1c85d 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -157,7 +157,19 @@ class ChannelObject { p.resize(finished); return finished; } + // read once only + size_t ReadOnce(std::vector& p, size_t size) { // NOLINT + if (size == 0) { + return 0; + } + std::unique_lock lock(mutex_); + p.resize(size); + size_t finished = Read(size, &p[0], lock, true); + p.resize(finished); + Notify(); + return finished; + } size_t ReadAll(std::vector& p) { // NOLINT p.clear(); size_t finished = 0; @@ -241,17 +253,21 @@ class ChannelObject { return !closed_; } - size_t Read(size_t n, T* p, std::unique_lock& lock) { // NOLINT + size_t Read(size_t n, T* p, std::unique_lock& lock, // NOLINT + bool once = 
false) { // NOLINT size_t finished = 0; CHECK(n <= MaxCapacity() - reading_count_); reading_count_ += n; while (finished < n && WaitForRead(lock)) { - size_t m = std::min(n - finished, data_.size()); + size_t m = (std::min)(n - finished, data_.size()); for (size_t i = 0; i < m; i++) { p[finished++] = std::move(data_.front()); data_.pop_front(); } reading_count_ -= m; + if (once && m > 0) { + break; + } } reading_count_ -= n - finished; return finished; diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index fdb24ee18eca7..4463fd9fd5340 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -36,6 +36,107 @@ DLManager& global_dlmanager_pool() { return manager; } +class BufferedLineFileReader { + typedef std::function SampleFunc; + static const int MAX_FILE_BUFF_SIZE = 4 * 1024 * 1024; + class FILEReader { + public: + explicit FILEReader(FILE* fp) : fp_(fp) {} + int read(char* buf, int len) { return fread(buf, sizeof(char), len, fp_); } + + private: + FILE* fp_; + }; + + public: + typedef std::function LineFunc; + + private: + template + int read_lines(T* reader, LineFunc func, int skip_lines) { + int lines = 0; + size_t ret = 0; + char* ptr = NULL; + char* eol = NULL; + total_len_ = 0; + error_line_ = 0; + + SampleFunc spfunc = get_sample_func(); + std::string x; + while (!is_error() && (ret = reader->read(buff_, MAX_FILE_BUFF_SIZE)) > 0) { + total_len_ += ret; + ptr = buff_; + eol = reinterpret_cast(memchr(ptr, '\n', ret)); + while (eol != NULL) { + int size = static_cast((eol - ptr) + 1); + x.append(ptr, size - 1); + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + + x.clear(); + ptr += size; + ret -= size; + eol = reinterpret_cast(memchr(ptr, '\n', ret)); + } + if (ret > 0) { + x.append(ptr, ret); + } + } + if (!is_error() && !x.empty()) { + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + } + return lines; + } + + public: + BufferedLineFileReader() + : random_engine_(std::random_device()()), + uniform_distribution_(0.0f, 1.0f) { + total_len_ = 0; + sample_line_ = 0; + buff_ = + reinterpret_cast(calloc(MAX_FILE_BUFF_SIZE + 1, sizeof(char))); + } + ~BufferedLineFileReader() { free(buff_); } + + int read_file(FILE* fp, LineFunc func, int skip_lines) { + FILEReader reader(fp); + return read_lines(&reader, func, skip_lines); + } + uint64_t file_size(void) { return total_len_; } + void set_sample_rate(float r) { sample_rate_ = r; } + size_t get_sample_line() { return sample_line_; } + bool is_error(void) { return (error_line_ > 10); } + + private: + SampleFunc get_sample_func() { + if (std::abs(sample_rate_ - 1.0f) < 1e-5f) { + return [this](void) { return true; }; + } + return [this](void) { + return (uniform_distribution_(random_engine_) < sample_rate_); + }; + } + + private: + char* buff_ = nullptr; + uint64_t total_len_ = 0; + + std::default_random_engine random_engine_; + std::uniform_real_distribution uniform_distribution_; + float sample_rate_ = 1.0f; + size_t sample_line_ = 0; + size_t error_line_ = 0; +}; void RecordCandidateList::ReSize(size_t length) { mutex_.lock(); capacity_ = length; @@ -301,7 +402,7 @@ int InMemoryDataFeed::Next() { << ", thread_id=" << thread_id_; } } else { - VLOG(3) << "enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size(); if (offset_index_ >= batch_offsets_.size()) { VLOG(3) << "offset_index: " << offset_index_ 
@@ -318,14 +419,7 @@ int InMemoryDataFeed::Next() { VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" << thread_id_; } - /* - if (offset_index_ == batch_offsets_.size() - 1) { - std::vector data; - output_channel_->ReadAll(data); - consume_channel_->Write(std::move(data)); - } - */ - VLOG(3) << "#15 enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size() << " baych_size: " << this->batch_size_; } diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 198bc51463af3..5527eaf1f6fa4 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -39,8 +39,14 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" +DECLARE_int32(record_pool_max_size); +DECLARE_int32(slotpool_thread_num); +DECLARE_bool(enable_slotpool_wait_release); +DECLARE_bool(enable_slotrecord_reset_shrink); + namespace paddle { namespace framework { class DataFeedDesc; @@ -69,6 +75,50 @@ namespace framework { // while (reader->Next()) { // // trainer do something // } + +template +struct SlotValues { + std::vector slot_values; + std::vector slot_offsets; + + void add_values(const T* values, uint32_t num) { + if (slot_offsets.empty()) { + slot_offsets.push_back(0); + } + if (num > 0) { + slot_values.insert(slot_values.end(), values, values + num); + } + slot_offsets.push_back(static_cast(slot_values.size())); + } + T* get_values(int idx, size_t* size) { + uint32_t& offset = slot_offsets[idx]; + (*size) = slot_offsets[idx + 1] - offset; + return &slot_values[offset]; + } + void add_slot_feasigns(const std::vector>& slot_feasigns, + uint32_t fea_num) { + slot_values.reserve(fea_num); + int slot_num = static_cast(slot_feasigns.size()); + slot_offsets.resize(slot_num + 1); + for (int i = 0; i < slot_num; ++i) { + auto& slot_val = slot_feasigns[i]; + slot_offsets[i] = static_cast(slot_values.size()); + uint32_t num = static_cast(slot_val.size()); + if (num > 0) { + slot_values.insert(slot_values.end(), slot_val.begin(), slot_val.end()); + } + } + slot_offsets[slot_num] = slot_values.size(); + } + void clear(bool shrink) { + slot_offsets.clear(); + slot_values.clear(); + if (shrink) { + slot_values.shrink_to_fit(); + slot_offsets.shrink_to_fit(); + } + } +}; union FeatureFeasign { uint64_t uint64_feasign_; float float_feasign_; @@ -97,6 +147,38 @@ struct FeatureItem { uint16_t slot_; }; +struct AllSlotInfo { + std::string slot; + std::string type; + int used_idx; + int slot_value_idx; +}; +struct UsedSlotInfo { + int idx; + int slot_value_idx; + std::string slot; + std::string type; + bool dense; + std::vector local_shape; + int total_dims_without_inductive; + int inductive_shape_index; +}; +struct SlotRecordObject { + uint64_t search_id; + uint32_t rank; + uint32_t cmatch; + std::string ins_id_; + SlotValues slot_uint64_feasigns_; + SlotValues slot_float_feasigns_; + + ~SlotRecordObject() { clear(true); } + void reset(void) { clear(FLAGS_enable_slotrecord_reset_shrink); } + void clear(bool shrink) { + slot_uint64_feasigns_.clear(shrink); + slot_float_feasigns_.clear(shrink); + } +}; +using SlotRecord = SlotRecordObject*; // sizeof Record is much less than std::vector struct Record { std::vector uint64_feasigns_; @@ -108,6 +190,179 @@ struct Record { uint32_t cmatch; 
}; +inline SlotRecord make_slotrecord() { + static const size_t slot_record_byte_size = sizeof(SlotRecordObject); + void* p = malloc(slot_record_byte_size); + new (p) SlotRecordObject; + return reinterpret_cast(p); +} + +inline void free_slotrecord(SlotRecordObject* p) { + p->~SlotRecordObject(); + free(p); +} + +template +class SlotObjAllocator { + public: + explicit SlotObjAllocator(std::function deleter) + : free_nodes_(NULL), capacity_(0), deleter_(deleter) {} + ~SlotObjAllocator() { clear(); } + + void clear() { + T* tmp = NULL; + while (free_nodes_ != NULL) { + tmp = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + deleter_(tmp); + --capacity_; + } + CHECK_EQ(capacity_, static_cast(0)); + } + T* acquire(void) { + T* x = NULL; + x = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + --capacity_; + return x; + } + void release(T* x) { + Node* node = reinterpret_cast(reinterpret_cast(x)); + node->next = free_nodes_; + free_nodes_ = node; + ++capacity_; + } + size_t capacity(void) { return capacity_; } + + private: + struct alignas(T) Node { + union { + Node* next; + char data[sizeof(T)]; + }; + }; + Node* free_nodes_; // a list + size_t capacity_; + std::function deleter_ = nullptr; +}; +static const int OBJPOOL_BLOCK_SIZE = 10000; +class SlotObjPool { + public: + SlotObjPool() + : max_capacity_(FLAGS_record_pool_max_size), alloc_(free_slotrecord) { + ins_chan_ = MakeChannel(); + ins_chan_->SetBlockSize(OBJPOOL_BLOCK_SIZE); + for (int i = 0; i < FLAGS_slotpool_thread_num; ++i) { + threads_.push_back(std::thread([this]() { run(); })); + } + disable_pool_ = false; + count_ = 0; + } + ~SlotObjPool() { + ins_chan_->Close(); + for (auto& t : threads_) { + t.join(); + } + } + void disable_pool(bool disable) { disable_pool_ = disable; } + void set_max_capacity(size_t max_capacity) { max_capacity_ = max_capacity; } + void get(std::vector* output, int n) { + output->resize(n); + return get(&(*output)[0], n); + } + void get(SlotRecord* output, int n) { + int size = 0; + mutex_.lock(); + int left = static_cast(alloc_.capacity()); + if (left > 0) { + size = (left >= n) ? 
n : left; + for (int i = 0; i < size; ++i) { + output[i] = alloc_.acquire(); + } + } + mutex_.unlock(); + count_ += n; + if (size == n) { + return; + } + for (int i = size; i < n; ++i) { + output[i] = make_slotrecord(); + } + } + void put(std::vector* input) { + size_t size = input->size(); + if (size == 0) { + return; + } + put(&(*input)[0], size); + input->clear(); + } + void put(SlotRecord* input, size_t size) { + CHECK(ins_chan_->WriteMove(size, input) == size); + } + void run(void) { + std::vector input; + while (ins_chan_->ReadOnce(input, OBJPOOL_BLOCK_SIZE)) { + if (input.empty()) { + continue; + } + // over max capacity + size_t n = input.size(); + count_ -= n; + if (disable_pool_ || n + capacity() > max_capacity_) { + for (auto& t : input) { + free_slotrecord(t); + } + } else { + for (auto& t : input) { + t->reset(); + } + mutex_.lock(); + for (auto& t : input) { + alloc_.release(t); + } + mutex_.unlock(); + } + input.clear(); + } + } + void clear(void) { + platform::Timer timeline; + timeline.Start(); + mutex_.lock(); + alloc_.clear(); + mutex_.unlock(); + // wait release channel data + if (FLAGS_enable_slotpool_wait_release) { + while (!ins_chan_->Empty()) { + sleep(1); + } + } + timeline.Pause(); + VLOG(3) << "clear slot pool data size=" << count_.load() + << ", span=" << timeline.ElapsedSec(); + } + size_t capacity(void) { + mutex_.lock(); + size_t total = alloc_.capacity(); + mutex_.unlock(); + return total; + } + + private: + size_t max_capacity_; + Channel ins_chan_; + std::vector threads_; + std::mutex mutex_; + SlotObjAllocator alloc_; + bool disable_pool_; + std::atomic count_; // NOLINT +}; + +inline SlotObjPool& SlotRecordPool() { + static SlotObjPool pool; + return pool; +} struct PvInstanceObject { std::vector ads; void merge_instance(Record* ins) { ads.push_back(ins); } @@ -129,7 +384,21 @@ class CustomParser { CustomParser() {} virtual ~CustomParser() {} virtual void Init(const std::vector& slots) = 0; + virtual bool Init(const std::vector& slots) = 0; virtual void ParseOneInstance(const char* str, Record* instance) = 0; + virtual bool ParseOneInstance( + const std::string& line, + std::function&, int)> + GetInsFunc) { // NOLINT + return true; + } + virtual bool ParseFileInstance( + std::function ReadBuffFunc, + std::function&, int, int)> + PullRecordsFunc, // NOLINT + int& lines) { // NOLINT + return false; + } }; typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)(); @@ -194,6 +463,34 @@ class DLManager { return nullptr; } + paddle::framework::CustomParser* Load(const std::string& name, + const std::vector& conf) { +#ifdef _LINUX + std::lock_guard lock(mutex_); + DLHandle handle; + std::map::iterator it = handle_map_.find(name); + if (it != handle_map_.end()) { + return it->second.parser; + } + handle.module = dlopen(name.c_str(), RTLD_NOW); + if (handle.module == nullptr) { + VLOG(0) << "Create so of " << name << " fail"; + exit(-1); + return nullptr; + } + + CreateParserObjectFunc create_parser_func = + (CreateParserObjectFunc)dlsym(handle.module, "CreateParserObject"); + handle.parser = create_parser_func(); + handle.parser->Init(conf); + handle_map_.insert({name, handle}); + + return handle.parser; +#endif + VLOG(0) << "Not implement in windows"; + return nullptr; + } + paddle::framework::CustomParser* ReLoad(const std::string& name, const std::vector& conf) { Close(name); @@ -415,6 +712,11 @@ class InMemoryDataFeed : public DataFeed { virtual void SetCurrentPhase(int current_phase); virtual void LoadIntoMemory(); virtual void 
LoadIntoMemoryFromSo(); + virtual void SetRecord(T* records) { records_ = records; } + int GetDefaultBatchSize() { return default_batch_size_; } + void AddBatchOffset(const std::pair& offset) { + batch_offsets_.push_back(offset); + } protected: virtual bool ParseOneInstance(T* instance) = 0; @@ -424,6 +726,11 @@ class InMemoryDataFeed : public DataFeed { virtual void PutToFeedVec(const std::vector& ins_vec) = 0; virtual void PutToFeedVec(const T* ins_vec, int num) = 0; + std::vector> batch_float_feasigns_; + std::vector> batch_uint64_feasigns_; + std::vector> offset_; + std::vector visit_; + int thread_id_; int thread_num_; bool parse_ins_id_; @@ -783,11 +1090,7 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { MultiSlotInMemoryDataFeed() {} virtual ~MultiSlotInMemoryDataFeed() {} virtual void Init(const DataFeedDesc& data_feed_desc); - void SetRecord(Record* records) { records_ = records; } - int GetDefaultBatchSize() { return default_batch_size_; } - void AddBatchOffset(const std::pair& offset) { - batch_offsets_.push_back(offset); - } + // void SetRecord(Record* records) { records_ = records; } protected: virtual bool ParseOneInstance(Record* instance); @@ -798,10 +1101,6 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id, uint32_t* cmatch, uint32_t* rank); virtual void PutToFeedVec(const Record* ins_vec, int num); - std::vector> batch_float_feasigns_; - std::vector> batch_uint64_feasigns_; - std::vector> offset_; - std::vector visit_; }; class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 08c42a93d1fcb..82a39b206e6bd 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -351,10 +351,8 @@ static int compute_thread_batch_nccl( return thread_avg_batch_num; } -template -void DatasetImpl::SetHeterPs(bool enable_heterps) { +void MultiSlotDataset::PrepareTrain() { #ifdef PADDLE_WITH_GLOO - enable_heterps_ = enable_heterps; if (enable_heterps_) { if (input_records_.size() == 0 && input_channel_ != nullptr && input_channel_->Size() != 0) { @@ -541,22 +539,21 @@ void DatasetImpl::LocalShuffle() { << timeline.ElapsedSec() << " seconds"; } -template -void DatasetImpl::GlobalShuffle(int thread_num) { +void MultiSlotDataset::GlobalShuffle(int thread_num) { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "DatasetImpl::GlobalShuffle() begin"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() begin"; platform::Timer timeline; timeline.Start(); auto fleet_ptr = FleetWrapper::GetInstance(); if (!input_channel_ || input_channel_->Size() == 0) { - VLOG(3) << "DatasetImpl::GlobalShuffle() end, no data to shuffle"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() end, no data to shuffle"; return; } // local shuffle input_channel_->Close(); - std::vector data; + std::vector data; input_channel_->ReadAll(data); std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine()); input_channel_->Open(); @@ -566,10 +563,10 @@ void DatasetImpl::GlobalShuffle(int thread_num) { input_channel_->Close(); input_channel_->SetBlockSize(fleet_send_batch_size_); - VLOG(3) << "DatasetImpl::GlobalShuffle() input_channel_ size " + VLOG(3) << "MultiSlotDataset::GlobalShuffle() input_channel_ size " << input_channel_->Size(); - auto get_client_id = [this, fleet_ptr](const T& data) -> size_t { + auto get_client_id = [this, fleet_ptr](const Record& data) -> size_t { if (!this->merge_by_insid_) { return 
fleet_ptr->LocalRandomEngine()() % this->trainer_num_; } else { @@ -580,7 +577,7 @@ void DatasetImpl::GlobalShuffle(int thread_num) { auto global_shuffle_func = [this, get_client_id]() { auto fleet_ptr = FleetWrapper::GetInstance(); - std::vector data; + std::vector data; while (this->input_channel_->Read(data)) { std::vector ars(this->trainer_num_); for (auto& t : data) { @@ -835,9 +832,6 @@ void DatasetImpl::CreateReaders() { channel_idx = 0; } } - if (enable_heterps_) { - SetHeterPs(true); - } VLOG(3) << "readers size: " << readers_.size(); } @@ -923,9 +917,8 @@ int64_t DatasetImpl::GetShuffleDataSize() { return sum; } -template -int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, - const std::string& msg) { +int MultiSlotDataset::ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) { #ifdef _LINUX VLOG(3) << "ReceiveFromClient msg_type=" << msg_type << ", client_id=" << client_id << ", msg length=" << msg.length(); @@ -937,9 +930,9 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, if (ar.Cursor() == ar.Finish()) { return 0; } - std::vector data; + std::vector data; while (ar.Cursor() < ar.Finish()) { - data.push_back(ar.Get()); + data.push_back(ar.Get()); } CHECK(ar.Cursor() == ar.Finish()); @@ -966,6 +959,20 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, // explicit instantiation template class DatasetImpl; +void MultiSlotDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + void MultiSlotDataset::PostprocessInstance() { // divide pv instance, and merge to input_channel_ if (enable_pv_merge_) { @@ -1503,5 +1510,126 @@ void MultiSlotDataset::SlotsShuffle( << ", cost time=" << timeline.ElapsedSec() << " seconds"; } +template class DatasetImpl; +void SlotRecordDataset::CreateChannel() { + if (input_channel_ == nullptr) { + input_channel_ = paddle::framework::MakeChannel(); + } +} +void SlotRecordDataset::CreateReaders() { + VLOG(3) << "Calling CreateReaders()"; + VLOG(3) << "thread num in Dataset: " << thread_num_; + VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); + VLOG(3) << "channel num in Dataset: " << channel_num_; + CHECK(thread_num_ > 0) << "thread num should > 0"; + CHECK(channel_num_ > 0) << "channel num should > 0"; + CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; + VLOG(3) << "readers size: " << readers_.size(); + if (readers_.size() != 0) { + VLOG(3) << "readers_.size() = " << readers_.size() + << ", will not create again"; + return; + } + VLOG(3) << "data feed class name: " << data_feed_desc_.name(); + for (int i = 0; i < thread_num_; ++i) { + readers_.push_back(DataFeedFactory::CreateDataFeed(data_feed_desc_.name())); + readers_[i]->Init(data_feed_desc_); + readers_[i]->SetThreadId(i); + readers_[i]->SetThreadNum(thread_num_); + readers_[i]->SetFileListMutex(&mutex_for_pick_file_); + readers_[i]->SetFileListIndex(&file_idx_); + readers_[i]->SetFeaNumMutex(&mutex_for_fea_num_); + readers_[i]->SetFeaNum(&total_fea_num_); + readers_[i]->SetFileList(filelist_); + readers_[i]->SetParseInsId(parse_ins_id_); + readers_[i]->SetParseContent(parse_content_); + 
readers_[i]->SetParseLogKey(parse_logkey_); + readers_[i]->SetEnablePvMerge(enable_pv_merge_); + readers_[i]->SetCurrentPhase(current_phase_); + if (input_channel_ != nullptr) { + readers_[i]->SetInputChannel(input_channel_.get()); + } + } + VLOG(3) << "readers size: " << readers_.size(); +} + +void SlotRecordDataset::ReleaseMemory() { + VLOG(3) << "SlotRecordDataset::ReleaseMemory() begin"; + platform::Timer timeline; + timeline.Start(); + + if (input_channel_) { + input_channel_->Clear(); + input_channel_ = nullptr; + } + if (enable_heterps_) { + VLOG(3) << "put pool records size: " << input_records_.size(); + SlotRecordPool().put(&input_records_); + input_records_.clear(); + input_records_.shrink_to_fit(); + VLOG(3) << "release heterps input records records size: " + << input_records_.size(); + } + + readers_.clear(); + readers_.shrink_to_fit(); + + std::vector>().swap(readers_); + + VLOG(3) << "SlotRecordDataset::ReleaseMemory() end"; + VLOG(3) << "total_feasign_num_(" << STAT_GET(STAT_total_feasign_num_in_mem) + << ") - current_fea_num_(" << total_fea_num_ << ") = (" + << STAT_GET(STAT_total_feasign_num_in_mem) - total_fea_num_ << ")" + << " object pool size=" << SlotRecordPool().capacity(); // For Debug + STAT_SUB(STAT_total_feasign_num_in_mem, total_fea_num_); +} +void SlotRecordDataset::GlobalShuffle(int thread_num) { + // TODO(yaoxuefeng) + return; +} + +void SlotRecordDataset::DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins) { + if (channel_num_ == channel_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustChannelNum channel_num_=" + << channel_num_ << ", channel_num_=channel_num, no need to adjust"; + return; + } + VLOG(3) << "adjust channel num from " << channel_num_ << " to " + << channel_num; + channel_num_ = channel_num; + + if (static_cast(input_channel_->Size()) >= channel_num) { + input_channel_->SetBlockSize(input_channel_->Size() / channel_num + + (discard_remaining_ins ? 
0 : 1)); + } + + VLOG(3) << "adjust channel num done"; +} + +void SlotRecordDataset::PrepareTrain() { +#ifdef PADDLE_WITH_GLOO + return; +#else + PADDLE_THROW(platform::errors::Unavailable( + "dataset set heterps need compile with GLOO")); +#endif + return; +} + +void SlotRecordDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index f3ee96fab8297..981fb694e0fec 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -149,7 +149,6 @@ class Dataset { virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 0; - virtual void SetHeterPs(bool enable_heterps) = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -207,7 +206,7 @@ class DatasetImpl : public Dataset { virtual void WaitPreLoadDone(); virtual void ReleaseMemory(); virtual void LocalShuffle(); - virtual void GlobalShuffle(int thread_num = -1); + virtual void GlobalShuffle(int thread_num = -1) {} virtual void SlotsShuffle(const std::set& slots_to_replace) {} virtual const std::vector& GetSlotsOriginalData() { return slots_shuffle_original_data_; @@ -233,7 +232,11 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); - virtual void SetHeterPs(bool enable_heterps); + /* for enable_heterps_ + virtual void EnableHeterps(bool enable_heterps) { + enable_heterps_ = enable_heterps; + } + */ std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -251,7 +254,10 @@ class DatasetImpl : public Dataset { protected: virtual int ReceiveFromClient(int msg_type, int client_id, - const std::string& msg); + const std::string& msg) { + // TODO(yaoxuefeng) for SlotRecordDataset + return -1; + } std::vector> readers_; std::vector> preload_readers_; paddle::framework::Channel input_channel_; @@ -327,6 +333,32 @@ class MultiSlotDataset : public DatasetImpl { const std::unordered_set& slots_to_replace, std::vector* result); virtual ~MultiSlotDataset() {} + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustReadersNum(int thread_num); + virtual void PrepareTrain(); + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg); +}; +class SlotRecordDataset : public DatasetImpl { + public: + SlotRecordDataset() { SlotRecordPool(); } + virtual ~SlotRecordDataset() {} + // create input channel + virtual void CreateChannel(); + // create readers + virtual void CreateReaders(); + // release memory + virtual void ReleaseMemory(); + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins); + virtual void PrepareTrain(); + virtual void DynamicAdjustReadersNum(int thread_num); + + protected: + bool enable_heterps_ = true; }; } // end namespace framework diff --git a/paddle/fluid/framework/dataset_factory.cc 
b/paddle/fluid/framework/dataset_factory.cc index aeaf961185323..38200927c5586 100644 --- a/paddle/fluid/framework/dataset_factory.cc +++ b/paddle/fluid/framework/dataset_factory.cc @@ -53,7 +53,7 @@ std::unique_ptr DatasetFactory::CreateDataset( std::string dataset_class) { if (g_dataset_map.count(dataset_class) < 1) { LOG(WARNING) << "Your Dataset " << dataset_class - << "is not supported currently"; + << " is not supported currently"; LOG(WARNING) << "Supported Dataset: " << DatasetTypeList(); exit(-1); } @@ -61,5 +61,6 @@ std::unique_ptr DatasetFactory::CreateDataset( } REGISTER_DATASET_CLASS(MultiSlotDataset); +REGISTER_DATASET_CLASS(SlotRecordDataset); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 89a829f9490f9..72b95dcc15346 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -680,3 +680,11 @@ PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); + +DEFINE_int32(record_pool_max_size, 2000000, + "SlotRecordDataset slot record pool max size"); +DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); +DEFINE_bool(enable_slotpool_wait_release, false, + "enable slotrecord obejct wait release, default false"); +DEFINE_bool(enable_slotrecord_reset_shrink, false, + "enable slotrecord obejct reset shrink memory, default false"); \ No newline at end of file diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 41cf0189d3d9d..7a32d8729fc6c 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -309,8 +309,6 @@ void BindDataset(py::module *m) { &framework::Dataset::SetFleetSendSleepSeconds, py::call_guard()) .def("enable_pv_merge", &framework::Dataset::EnablePvMerge, - py::call_guard()) - .def("set_heter_ps", &framework::Dataset::SetHeterPs, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") From a9ea41c5e251e2cf8b15d286e938a961d8c1cb28 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Wed, 29 Sep 2021 15:10:03 +0800 Subject: [PATCH 07/80] Spinlock (#36030) * add align for WorkQueue * add spinlock * merge spinlock --- .../fluid/framework/new_executor/run_queue.h | 10 +++-- .../fluid/framework/new_executor/workqueue.cc | 4 +- .../framework/new_executor/workqueue_utils.h | 1 + paddle/fluid/memory/allocation/spin_lock.h | 43 ++++++++++++------- 4 files changed, 36 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/run_queue.h index 13035237ff8b4..e457b20a3c35d 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/run_queue.h @@ -37,6 +37,8 @@ #include #include #include +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" namespace paddle { namespace framework { @@ -101,7 +103,7 @@ class RunQueue { // PushBack adds w at the end of the queue. // If queue is full returns w, otherwise returns default-constructed Work. 
Work PushBack(Work w) { - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[(back - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -123,7 +125,7 @@ class RunQueue { return Work(); } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[back & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -145,7 +147,7 @@ class RunQueue { return 0; } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); unsigned size = Size(); unsigned mid = back; @@ -213,7 +215,7 @@ class RunQueue { // modification counters. alignas(64) std::atomic front_; alignas(64) std::atomic back_; - std::mutex mutex_; + paddle::memory::SpinLock mutex_; Elem array_[kSize]; // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index bc5a4e27dc528..8c6eeab4d5c0a 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -166,7 +166,7 @@ std::unique_ptr CreateMultiThreadedWorkQueue( "WorkQueueOptions.num_threads must be " "greater than 1.")); std::unique_ptr ptr(new WorkQueueImpl(options)); - return ptr; + return std::move(ptr); } std::unique_ptr CreateWorkQueueGroup( @@ -176,7 +176,7 @@ std::unique_ptr CreateWorkQueueGroup( "For a WorkQueueGroup, the number of WorkQueueOptions " "must be greater than 1.")); std::unique_ptr ptr(new WorkQueueGroupImpl(queues_options)); - return ptr; + return std::move(ptr); } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index 6907f2f17da0d..bb219fea36267 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include diff --git a/paddle/fluid/memory/allocation/spin_lock.h b/paddle/fluid/memory/allocation/spin_lock.h index 42462fd74b4cd..2bbe340e7c691 100644 --- a/paddle/fluid/memory/allocation/spin_lock.h +++ b/paddle/fluid/memory/allocation/spin_lock.h @@ -15,37 +15,48 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 +#if defined(_M_X64) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(__i386__) +#define __PADDLE_x86__ +#include +#endif +#include #include "paddle/fluid/platform/macros.h" namespace paddle { namespace memory { +static inline void CpuRelax() { +#if defined(__PADDLE_x86__) + _mm_pause(); +#endif +} class SpinLock { public: SpinLock() : mlock_(false) {} void lock() { - bool expect = false; - uint64_t spin_cnt = 0; - while (!mlock_.compare_exchange_weak(expect, true)) { - expect = false; - if ((++spin_cnt & 0xFF) == 0) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif + for (;;) { + if (!mlock_.exchange(true, std::memory_order_acquire)) { + break; + } + constexpr int kMaxLoop = 32; + for (int loop = 1; mlock_.load(std::memory_order_relaxed);) { + if (loop <= kMaxLoop) { + for (int i = 1; i <= loop; ++i) { + CpuRelax(); + } + loop *= 2; + } else { + std::this_thread::yield(); + } } } } - void unlock() { mlock_.store(false); } + void unlock() { mlock_.store(false, std::memory_order_release); } + 
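+  // lock() above first attempts an atomic exchange with acquire ordering;
+  // under contention it spins on a relaxed load (test-and-test-and-set),
+  // issuing _mm_pause() on x86 via CpuRelax() with exponential backoff
+  // (1, 2, 4, ... up to 32 pauses) before falling back to
+  // std::this_thread::yield(). unlock() publishes with a release store so
+  // writes made inside the critical section become visible to the next
+  // acquirer; callers in this patch (e.g. RunQueue) hold the lock through
+  // std::unique_lock.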
DISABLE_COPY_AND_ASSIGN(SpinLock); private: From 1f93582cd1f13a09971e2c03334d649d82238e5b Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Wed, 29 Sep 2021 16:24:59 +0800 Subject: [PATCH 08/80] Add functional autograd API:hessian (#36108) * init functional jacobian api * finish test with dtype float32 * add float64 test case * polish code * use atol=1e-5 with dtype float64 * fix for ci * set timeout for test_jacobian * init hessian API * save status * polish API docstring * modify docstring * add utils.py * save status * fix dygraph double grad dtype error when calling for high differential senario * reinvoke ci * test_hessian.py is ok * polish hessian API * init vhp * Revert "init vhp" This reverts commit cbd4d3b66abe82b0ac10721b9eddeb7d82e0a1c8. * add test for partial_engine.cc * modify numerical_delta with dtype float32 * merge fix for dtype float64 * spell fix * polish code * rm _stop_gradient_pre_process Co-authored-by: JiabinYang <360788950@qq.com> --- python/paddle/autograd/__init__.py | 2 +- python/paddle/autograd/functional.py | 152 +++++++++++++++--- python/paddle/autograd/utils.py | 49 ++++++ .../tests/unittests/autograd/CMakeLists.txt | 1 + .../tests/unittests/autograd/test_hessian.py | 140 ++++++++++++++++ .../tests/unittests/autograd/test_jacobian.py | 60 +------ .../fluid/tests/unittests/autograd/utils.py | 107 ++++++++++++ 7 files changed, 426 insertions(+), 85 deletions(-) create mode 100644 python/paddle/autograd/utils.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_hessian.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/utils.py diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index dfbb3cfb45f2b..f4a0122759dc5 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,6 +18,6 @@ from .py_layer import PyLayer, PyLayerContext # noqa: F401 from ..framework import set_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 -from .functional import jacobian # noqa: F401 +from .functional import jacobian, hessian # noqa: F401 __all__ = ['backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py index c1b4dd9e3a2db..a5665631c937f 100644 --- a/python/paddle/autograd/functional.py +++ b/python/paddle/autograd/functional.py @@ -13,34 +13,10 @@ # limitations under the License. 
from paddle.fluid import framework +from .utils import _check_tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor import paddle -def _check_tensors(in_out_list, name): - assert in_out_list is not None, "{} should not be None".format(name) - - if isinstance(in_out_list, (list, tuple)): - assert len(in_out_list) > 0, "{} connot be empyt".format(name) - for each_var in in_out_list: - assert isinstance( - each_var, - paddle.Tensor), "Elements of {} must be paddle.Tensor".format( - name) - return in_out_list - else: - assert isinstance( - in_out_list, - paddle.Tensor), "{} must be Tensor or list of Tensor".format(name) - return [in_out_list] - - -def _stack_tensor_or_return_none(origin_list): - assert len(origin_list) > 0, "Can't not stack an empty list" - return paddle.stack( - origin_list, axis=0) if isinstance(origin_list[0], - paddle.Tensor) else None - - @framework.dygraph_only def jacobian(func, inputs, create_graph=False, allow_unused=False): ''' @@ -183,3 +159,129 @@ def func(x, y): return jacobian[0] else: return jacobian + + +@framework.dygraph_only +def hessian(func, inputs, create_graph=False, allow_unused=False): + ''' + .. note:: + **This API is ONLY available in imperative mode.** + + This API computes the Hessian matrix of `func` with respect to `inputs`. + + Parameters: + func (function): a Python function that takes a Tensor or a Tensor + list/tuple as inputs and returns a Tensor with a single element. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the function ``func``. + create_graph (bool, optional): whether to create the gradient graphs + of the computing process. When it is True, higher order derivatives + are supported to compute; when it is False, the gradient graphs of + the computing process would be discarded. Defaults to ``False``. + allow_unused (bool, optional): whether to raise error or return None if + some Tensors of `inputs` are unreachable in the graph. Error would + be raised if allow_unused=False, and None would be returned as + their gradients if allow_unused=True. Default False. + Returns: + Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` + takes a Tensor as ``inputs``, Hessian will be a single Tensor containing + the Hessian matrix for the linearized ``inputs`` Tensor. If function + ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will + be a tuple of tuple of Tensors where ``Hessian[i][j]`` will contain the + Hessian matrix of the ``i``th input and ``j``th input with size ``m * n``. + Here ``m`` and ``n`` denote the number of elements of the ``i`` th input + and the ``j`` th input respectively. + + Examples 1: + .. code-block:: python + + import paddle + + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + hessian = paddle.autograd.hessian(func, x) + print(hessian) + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 0., 2., 1.], + # [1., 2., 0., 1.], + # [0., 1., 1., 2.]]) + + Examples 2: + .. 
code-block:: python + + import paddle + + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [x, y]) + print(hessian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 1., 0., 0.], + # [0., 0., 1., 1.], + # [1., 1., 0., 0.], + # [0., 0., 1., 1.]])), + # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 0., 1., 0.], + # [1., 0., 1., 0.], + # [0., 1., 0., 1.], + # [0., 1., 0., 1.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]))) + + Examples 3: + .. code-block:: python + + import paddle + + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True) + print(hessian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 0., 2., 1.], + # [1., 2., 0., 1.], + # [0., 1., 1., 2.]]), None), (None, None)) + + ''' + inputs = _check_tensors(inputs, "inputs") + outputs = func(*inputs) + assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ + 1 + ], "The function to compute Hessian matrix should return a Tensor with a single element" + + def jac_func(*ins): + grad_inputs = paddle.grad( + outputs, + ins, + create_graph=True, + retain_graph=True, + allow_unused=allow_unused) + return tuple( + _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) + for i in range(len(inputs))) + + return jacobian( + jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused) diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py new file mode 100644 index 0000000000000..d437f7d82d361 --- /dev/null +++ b/python/paddle/autograd/utils.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle + + +def _check_tensors(in_out_list, name): + assert in_out_list is not None, "{} should not be None".format(name) + + if isinstance(in_out_list, (list, tuple)): + assert len(in_out_list) > 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, + paddle.Tensor), "Elements of {} must be paddle.Tensor".format( + name) + return list(in_out_list) + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format(name) + return [in_out_list] + + +def _stack_tensor_or_return_none(origin_list): + assert len(origin_list) > 0, "Can't not stack an empty list" + return paddle.stack( + origin_list, axis=0) if isinstance(origin_list[0], + paddle.Tensor) else None + + +def _replace_none_with_zero_tensor(t, spec_t): + if t is None: + zero_t = paddle.zeros(shape=spec_t.shape, dtype=spec_t.dtype) + zero_t.stop_gradient = spec_t.stop_gradient + return zero_t + else: + return t diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 7f7a232fcefa6..1e9d433ebce8e 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -7,3 +7,4 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) +set_tests_properties(test_hessian PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_hessian.py b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py new file mode 100644 index 0000000000000..120a6c853e8d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
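+
+# These tests compare paddle.autograd.hessian against the nested
+# central-difference approximation in utils._compute_numerical_hessian;
+# the float32 cases use a coarser step and looser tolerances than float64.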
+ +import unittest +import numpy as np +import paddle +import paddle.compat as cpt +from utils import _compute_numerical_hessian + + +class TestHessian(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = 1e-2 + self.rtol = 1e-2 + self.atol = 1e-2 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + def test_single_input(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x) + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + + def test_multi_input(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + numerical_hessian = _compute_numerical_hessian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [self.x, self.y]) + for i in range(len(hessian)): + for j in range(len(hessian[0])): + assert np.allclose(hessian[i][j].numpy(), + numerical_hessian[i][j], self.rtol, + self.atol) + + def test_allow_unused_false(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + try: + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [self.x, self.y]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_allow_unused_true(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian( + func, [self.x, self.y], allow_unused=True) + for i in range(len(hessian)): + for j in range(len(hessian[0])): + if i == j == 0: + assert np.allclose(hessian[i][j].numpy(), + numerical_hessian[i][j], self.rtol, + self.atol) + else: + assert hessian[i][j] is None + + def test_create_graph_false(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x) + assert hessian.stop_gradient == True + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + try: + paddle.grad(hessian, self.x) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("has no gradient") > 0 + + # TODO(levi): enable this test case when matmul_grad_grad_grad is ok + def _test_create_graph_true(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x, create_graph=True) + assert hessian.stop_gradient == False + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + triple_grad = paddle.grad(hessian, self.x) + assert triple_grad is not None + + +class TestHessianFloat64(TestHessian): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float64' + self.np_dtype = np.float64 + 
self.numerical_delta = 1e-5 + self.rtol = 1e-5 + self.atol = 1e-5 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py index 2722d2c83b130..2f0b8c7cad3e5 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py @@ -16,65 +16,7 @@ import numpy as np import paddle import paddle.compat as cpt -from paddle.autograd.functional import _check_tensors - - -def _product(t): - if isinstance(t, int): - return t - else: - return np.product(t) - - -def _get_item(t, idx): - assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." - assert isinstance(idx, - int), "The second argument idx must be an int number." - flat_t = paddle.reshape(t, [-1]) - return flat_t.__getitem__(idx) - - -def _set_item(t, idx, value): - assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." - assert isinstance(idx, - int), "The second argument idx must be an int number." - flat_t = paddle.reshape(t, [-1]) - flat_t.__setitem__(idx, value) - return paddle.reshape(flat_t, t.shape) - - -def _compute_numerical_jacobian(func, xs, delta, np_dtype): - xs = _check_tensors(xs, "xs") - ys = _check_tensors(func(*xs), "ys") - fin_size = len(xs) - fout_size = len(ys) - jacobian = list([] for _ in range(fout_size)) - for i in range(fout_size): - jac_i = list([] for _ in range(fin_size)) - for j in range(fin_size): - jac_i[j] = np.zeros( - (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype) - jacobian[i] = jac_i - - for j in range(fin_size): - for q in range(_product(xs[j].shape)): - orig = _get_item(xs[j], q) - x_pos = orig + delta - xs[j] = _set_item(xs[j], q, x_pos) - ys_pos = _check_tensors(func(*xs), "ys_pos") - - x_neg = orig - delta - xs[j] = _set_item(xs[j], q, x_neg) - ys_neg = _check_tensors(func(*xs), "ys_neg") - - xs[j] = _set_item(xs[j], q, orig) - - for i in range(fout_size): - for p in range(_product(ys[i].shape)): - y_pos = _get_item(ys_pos[i], p) - y_neg = _get_item(ys_neg[i], p) - jacobian[i][j][p][q] = (y_pos - y_neg) / delta / 2. - return jacobian +from utils import _compute_numerical_jacobian class TestJacobian(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py new file mode 100644 index 0000000000000..0aadef4a809f3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -0,0 +1,107 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
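The helpers below approximate derivatives with a second-order central difference; the (y_pos - y_neg) / delta / 2. expression in _compute_numerical_jacobian is exactly that rule, and _compute_numerical_hessian applies the same rule to the numerical Jacobian. A tiny standalone sketch of the rule itself, in plain NumPy (central_diff is a hypothetical helper used only for illustration):

    import numpy as np

    def central_diff(f, x, delta=1e-4):
        # Second-order central difference:
        #   f'(x) ~= (f(x + delta) - f(x - delta)) / (2 * delta)
        return (f(x + delta) - f(x - delta)) / delta / 2.0

    # Example: the derivative of sin at 0 is cos(0) = 1.
    assert np.isclose(central_diff(np.sin, 0.0), 1.0, atol=1e-6)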
+ +import numpy as np +import paddle +from paddle.autograd.functional import _check_tensors + + +def _product(t): + if isinstance(t, int): + return t + else: + return np.product(t) + + +def _get_item(t, idx): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." + flat_t = paddle.reshape(t, [-1]) + return flat_t.__getitem__(idx) + + +def _set_item(t, idx, value): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." + flat_t = paddle.reshape(t, [-1]) + flat_t.__setitem__(idx, value) + return paddle.reshape(flat_t, t.shape) + + +def _compute_numerical_jacobian(func, xs, delta, np_dtype): + xs = _check_tensors(xs, "xs") + ys = _check_tensors(func(*xs), "ys") + fin_size = len(xs) + fout_size = len(ys) + jacobian = list([] for _ in range(fout_size)) + for i in range(fout_size): + jac_i = list([] for _ in range(fin_size)) + for j in range(fin_size): + jac_i[j] = np.zeros( + (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype) + jacobian[i] = jac_i + + for j in range(fin_size): + for q in range(_product(xs[j].shape)): + orig = _get_item(xs[j], q) + x_pos = orig + delta + xs[j] = _set_item(xs[j], q, x_pos) + ys_pos = _check_tensors(func(*xs), "ys_pos") + + x_neg = orig - delta + xs[j] = _set_item(xs[j], q, x_neg) + ys_neg = _check_tensors(func(*xs), "ys_neg") + + xs[j] = _set_item(xs[j], q, orig) + + for i in range(fout_size): + for p in range(_product(ys[i].shape)): + y_pos = _get_item(ys_pos[i], p) + y_neg = _get_item(ys_neg[i], p) + jacobian[i][j][p][q] = (y_pos - y_neg) / delta / 2. + return jacobian + + +def _compute_numerical_hessian(func, xs, delta, np_dtype): + xs = _check_tensors(xs, "xs") + ys = _check_tensors(func(*xs), "ys") + fin_size = len(xs) + hessian = list([] for _ in range(fin_size)) + for i in range(fin_size): + hessian_i = list([] for _ in range(fin_size)) + for j in range(fin_size): + hessian_i[j] = np.zeros( + (_product(xs[i].shape), _product(xs[j].shape)), dtype=np_dtype) + hessian[i] = hessian_i + + for i in range(fin_size): + for p in range(_product(xs[i].shape)): + for j in range(fin_size): + for q in range(_product(xs[j].shape)): + orig = _get_item(xs[j], q) + x_pos = orig + delta + xs[j] = _set_item(xs[j], q, x_pos) + jacobian_pos = _compute_numerical_jacobian(func, xs, delta, + np_dtype) + x_neg = orig - delta + xs[j] = _set_item(xs[j], q, x_neg) + jacobian_neg = _compute_numerical_jacobian(func, xs, delta, + np_dtype) + xs[j] = _set_item(xs[j], q, orig) + hessian[i][j][p][q] = ( + jacobian_pos[0][i][0][p] - jacobian_neg[0][i][0][p] + ) / delta / 2. 
+ return hessian From 3eb50715a53279c5df82c9d2c0c60802aef5387e Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Wed, 29 Sep 2021 16:50:35 +0800 Subject: [PATCH 09/80] fix cusparse compile problem, test=develop (#36199) * fix cusparse compile problem, test=develop * Modify file permissions --- paddle/fluid/platform/dynload/cusparse.cc | 4 ++++ paddle/fluid/platform/dynload/cusparse.h | 20 +++++++++++++------ .../unittests/test_sparse_attention_op.py | 8 ++++---- 3 files changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index 2b41da541d9ae..2a1fe322dabcf 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -26,6 +26,10 @@ void *cusparse_dso_handle; #ifdef CUSPARSE_ROUTINE_EACH CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #endif + +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index 98841949676e4..e5be003fadf06 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -41,8 +41,9 @@ extern void *cusparse_dso_handle; }; \ extern DynLoad__##__name __name -#ifndef _WIN32 -#if CUDA_VERSION >= 11020 +#if !defined(PADDLE_WITH_ARM) && !defined(_WIN32) +// APIs available after CUDA 11.0 +#if CUDA_VERSION >= 11000 #define CUSPARSE_ROUTINE_EACH(__macro) \ __macro(cusparseCreate); \ __macro(cusparseCreateCsr); \ @@ -51,12 +52,19 @@ extern void *cusparse_dso_handle; __macro(cusparseSpMM); \ __macro(cusparseDestroySpMat); \ __macro(cusparseDestroyDnMat); \ - __macro(cusparseDestroy); \ - __macro(cusparseSDDMM_bufferSize); \ - __macro(cusparseSDDMM_preprocess); \ - __macro(cusparseSDDMM); + __macro(cusparseDestroy); CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); + +// APIs available after CUDA 11.2 +#if CUDA_VERSION >= 11020 +#define CUSPARSE_ROUTINE_EACH_R2(__macro) \ + __macro(cusparseSDDMM_bufferSize); \ + __macro(cusparseSDDMM_preprocess); \ + __macro(cusparseSDDMM); + +CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +#endif #endif #endif diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py index ad618edd24d55..48401fb55ef3f 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -169,13 +169,13 @@ def setUp(self): 'Q': self.q, 'K': self.k, 'V': self.v, - 'offset': self.offset, - 'columns': self.columns + 'Offset': self.offset, + 'Columns': self.columns } self.outputs = { 'Out': result.astype(self.dtype), - 'ResultSdd': result_sdd.astype(self.dtype), - 'ResultSoftmax': result_softmax.astype(self.dtype) + 'SparseDotSdd': result_sdd.astype(self.dtype), + 'Softmax': result_softmax.astype(self.dtype) } def test_check_output(self): From 69eed34d1dd5b38e2810b0bafe0cac075fdd0d2e Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Wed, 29 Sep 2021 17:02:04 +0800 Subject: [PATCH 10/80] add optest for adamw (#36148) * update func name * skip cpu * update unittest * update unittest --- .../fluid/tests/unittests/test_adamw_op.py | 166 +++++++++++++++++- python/paddle/optimizer/adamw.py | 6 +- 2 files changed, 165 insertions(+), 7 
deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 2a5dc76c6bb28..0a60f4cba09bc 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -14,9 +14,153 @@ import unittest import paddle +import random import numpy as np import paddle.fluid as fluid +from op_test import OpTest from functools import partial +from paddle.framework import core + + +def adamw_step(inputs, attributes): + param = inputs['Param'] + grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + epsilon = attributes['epsilon'] + + if 'lr_ratio' in attributes: + lr = lr * attributes['lr_ratio'] + + if attributes["with_decay"]: + coeff = attributes["coeff"] + decay = 1.0 - lr * coeff + param2 = param * decay + param = param2.copy() + + if 'beta1' in attributes: + beta1 = attributes['beta1'] + else: + beta1 = inputs['Beta1Tensor'][0] + if 'beta2' in attributes: + beta2 = attributes['beta2'] + else: + beta2 = inputs['Beta2Tensor'][0] + + moment1_out = beta1 * moment1 + (1 - beta1) * grad + moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + return param_out, moment1_out, moment2_out + + +class TestAdamW(OpTest): + def setUp(self): + '''Test AdamW Op with supplied attributes + ''' + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "coeff": 0.5, + "with_decay": True + } + + param_out, moment1_out, \ + moment2_out = adamw_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestAdamW2(OpTest): + def setUp(self): + '''Test AdamW Op with supplied attributes + ''' + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (2, 2)).astype("float32") + grad = np.random.uniform(-1, 1, (2, 2)).astype("float32") + moment1 = np.random.uniform(-1, 1, (2, 2)).astype("float32") + # The second moment is positive + moment2 = np.random.random((2, 2)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': 
np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": 0.1, + "coeff": 0.5, + "with_decay": True + } + + param_out, moment1_out, moment2_out = adamw_step(self.inputs, + self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) class TestAdamWOp(unittest.TestCase): @@ -160,7 +304,14 @@ def simple_lr_setting(param, decay_rate, n_layers): return decay_rate**(n_layers + 2 - depth) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestAdamWOpLayerwiseLR(TestAdamWOp): + def setUp(self): + random.seed(2021) + np.random.seed(2021) + paddle.seed(2021) + def test_adamw_op_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") @@ -181,17 +332,20 @@ def test_adamw_op_dygraph(self): weight_decay=0.01, lr_ratio=simple_lr_fun) - for _ in range(2): + loss_ref = np.array( + [4.8383293, 3.0854003, 1.33299, -0.418993, -2.171043]) + for i in range(5): a1 = linear1(a) out = linear2(a1) + out = paddle.mean(out) out.backward() adam.step() adam.clear_gradients() + np.testing.assert_allclose(out[0].numpy(), loss_ref[i], rtol=1e-6) def test_adamw_op(self): paddle.enable_static() - place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ - else fluid.CPUPlace() + place = fluid.CUDAPlace(0) train_prog = fluid.Program() startup = fluid.Program() with fluid.program_guard(train_prog, startup): @@ -223,7 +377,10 @@ def test_adamw_op(self): exe = fluid.Executor(place) exe.run(startup) - for _ in range(2): + + loss_ref = np.array( + [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626]) + for i in range(5): inputs = np.random.random(size=[8, 10]).astype('float32') outputs = np.random.random(size=[8, 1]).astype('float32') rets = exe.run(train_prog, @@ -231,6 +388,7 @@ def test_adamw_op(self): "y": outputs}, fetch_list=[avg_cost]) assert rets[0] is not None + np.testing.assert_allclose(rets[0], loss_ref[i], rtol=1e-6) paddle.disable_static() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 34fb201d8ccaf..f26ee80d0af60 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -171,9 +171,9 @@ def __init__(self, self._lr_to_coeff = dict() if lr_ratio is not None: assert isinstance(lr_ratio, Callable) - if core.is_compiled_with_xpu() or core.is_compiled_with_npu(): + if not core.is_compiled_with_cuda(): raise NotImplementedError( - "'lr_ratio' is unimplemented in XPU and NPU") + "'lr_ratio' is unimplemented in CPU, XPU and NPU") self._lr_ratio = lr_ratio super(AdamW, self).__init__( @@ -305,7 +305,7 @@ def _append_optimize_op(self, block, param_and_grad): 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff, 'multi_precision', - find_master, "lr_ratio", lr_ratio_) + find_master, 'lr_ratio', lr_ratio_) return None From 21b93c3dc68c616f12c360ebbbd9961fe379902f Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 29 Sep 2021 17:12:17 +0800 Subject: [PATCH 11/80] Add 
basic support for CUDA Graph (#36190) * add basic support for CUDA Graph * fix ci compile error * fix LOG print, fix windows CI * follow comments and update * small fix for default ctor * fix rocm compile error * fix CPU compile error --- paddle/fluid/memory/allocation/CMakeLists.txt | 6 +- .../memory/allocation/allocator_facade.cc | 147 ++++++++++++++++-- .../memory/allocation/allocator_facade.h | 8 + .../auto_growth_best_fit_allocator.cc | 8 +- .../auto_growth_best_fit_allocator.h | 3 +- paddle/fluid/platform/CMakeLists.txt | 5 + paddle/fluid/platform/cuda_graph.cc | 92 +++++++++++ paddle/fluid/platform/cuda_graph.h | 136 ++++++++++++++++ .../platform/cuda_graph_with_memory_pool.cc | 43 +++++ .../platform/cuda_graph_with_memory_pool.h | 64 ++++++++ paddle/fluid/platform/gpu_info.cc | 2 + paddle/fluid/platform/type_defs.h | 1 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 15 ++ python/paddle/device/cuda/graphs.py | 57 +++++++ .../fluid/tests/unittests/test_cuda_graph.py | 60 +++++++ 16 files changed, 634 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/platform/cuda_graph.cc create mode 100644 paddle/fluid/platform/cuda_graph.h create mode 100644 paddle/fluid/platform/cuda_graph_with_memory_pool.cc create mode 100644 paddle/fluid/platform/cuda_graph_with_memory_pool.h create mode 100644 python/paddle/device/cuda/graphs.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_graph.py diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6b4afae9f8c75..4aa1900f53f5e 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -82,7 +82,11 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) + +if (WITH_GPU) + target_link_libraries(allocator_facade cuda_graph) +endif() cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 78bce53b6f4ff..0388e2d13afb0 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -32,6 +32,9 @@ #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph.h" +#endif #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif @@ -47,17 +50,64 @@ PADDLE_DEFINE_EXPORTED_bool( "Whether to use system allocator to allocate CPU and GPU memory. 
" "Only used for unittests."); +DECLARE_string(allocator_strategy); + namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_CUDA +class CUDAGraphAllocator + : public Allocator, + public std::enable_shared_from_this { + private: + class PrivateAllocation : public Allocation { + public: + PrivateAllocation(CUDAGraphAllocator* allocator, + AllocationPtr underlying_allocation) + : Allocation(underlying_allocation->ptr(), + underlying_allocation->size(), + underlying_allocation->place()), + allocator_(allocator->shared_from_this()), + underlying_allocation_(std::move(underlying_allocation)) {} + + private: + std::shared_ptr allocator_; + AllocationPtr underlying_allocation_; + }; + + explicit CUDAGraphAllocator(const std::shared_ptr& allocator) + : underlying_allocator_(allocator) {} + + public: + static std::shared_ptr Create( + const std::shared_ptr& allocator) { + return std::shared_ptr(new CUDAGraphAllocator(allocator)); + } + + protected: + Allocation* AllocateImpl(size_t size) { + VLOG(10) << "Allocate " << size << " for CUDA Graph"; + return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + } + + void FreeImpl(Allocation* allocation) { + VLOG(10) << "delete for CUDA Graph"; + delete allocation; + } + + private: + std::shared_ptr underlying_allocator_; +}; +#endif + class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; - AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { + strategy_ = GetAllocatorStrategy(); + switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU @@ -91,7 +141,8 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id)); + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk); } InitNaiveBestFitCUDAPinnedAllocator(); #endif @@ -117,7 +168,7 @@ class AllocatorFacadePrivate { default: { PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy))); + "Unsupported allocator strategy: %d", static_cast(strategy_))); } } InitZeroSizeAllocators(); @@ -130,11 +181,29 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } + inline const AllocatorMap& GetAllocatorMap() { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_allocator_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + return iter->second->allocators_; + } else { + return allocators_; + } +#else + return allocators_; +#endif + } + inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ - : allocators_) + : GetAllocatorMap()) : zero_size_allocators_); auto iter = allocators.find(place); PADDLE_ENFORCE_NE(iter, allocators.end(), @@ -145,6 +214,7 @@ class AllocatorFacadePrivate { private: void InitSystemAllocators() { + if (!system_allocators_.empty()) return; system_allocators_[platform::CPUPlace()] = std::make_shared(); #ifdef PADDLE_WITH_XPU int device_count = platform::GetXPUDeviceCount(); @@ -183,10 +253,11 @@ class AllocatorFacadePrivate { allocators_[p] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, + bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize()); + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); } #endif @@ -226,6 +297,7 @@ class AllocatorFacadePrivate { }; void InitZeroSizeAllocators() { + if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -279,12 +351,57 @@ class AllocatorFacadePrivate { } } +#ifdef PADDLE_WITH_CUDA + + public: + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); + } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } + + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; + } +#endif + private: AllocatorMap allocators_; - AllocatorMap zero_size_allocators_; - AllocatorMap system_allocators_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_allocator_map_; +#endif + AllocatorStrategy strategy_; + + static AllocatorMap zero_size_allocators_; + static AllocatorMap system_allocators_; }; +AllocatorFacadePrivate::AllocatorMap + AllocatorFacadePrivate::zero_size_allocators_; +AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; + // Pimpl. Make interface clean. 
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} // delete m_ may cause core dump when the destructor of python in conflict with @@ -316,6 +433,16 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +#ifdef PADDLE_WITH_CUDA +void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + return m_->PrepareMemoryPoolForCUDAGraph(id); +} + +void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + return m_->RemoveMemoryPoolOfCUDAGraph(id); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 7f6ad561aa931..8d889ec38eed7 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -18,6 +18,9 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { @@ -54,6 +57,11 @@ class AllocatorFacade { uint64_t Release(const platform::Place& place); const std::shared_ptr& GetAllocator(const platform::Place& place); +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); +#endif + // TODO(yy): Allocate a Copy-On-Write allocation? private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index a35d8a73f7eda..f36d589f907fb 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -39,11 +39,12 @@ namespace allocation { AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size) + size_t chunk_size, bool allow_free_idle_chunk) : underlying_allocator_( std::make_shared(underlying_allocator, alignment)), alignment_(alignment), - chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {} + chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), + allow_free_idle_chunk_(allow_free_idle_chunk) {} Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { size = AlignedSize(size, alignment_); @@ -139,6 +140,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { } uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { + if (!allow_free_idle_chunk_) { + return 0; + } uint64_t bytes = 0; for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) { auto &blocks = chunk_it->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 5ed6eb94f158f..d1fa6cce0164f 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator { public: AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size = 0); + size_t chunk_size = 0, bool allow_free_idle_chunk = true); bool IsAllocThreadSafe() const override { return true; } @@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator { 
std::list chunks_; size_t alignment_; size_t chunk_size_; + bool allow_free_idle_chunk_; SpinLock spinlock_; }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 2540170ed54fb..21213f9e6ff21 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -59,9 +59,14 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) IF(WITH_GPU) + nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) + nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) +ELSE() + cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() + IF(WITH_ROCM) hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) ENDIF() diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/cuda_graph.cc new file mode 100644 index 0000000000000..6e518d779e9cd --- /dev/null +++ b/paddle/fluid/platform/cuda_graph.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/cuda_graph.h" + +namespace paddle { +namespace platform { + +std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; + +void CUDAGraph::Reset() { + if (is_reset_) return; +#if CUDA_VERSION >= 10010 + if (graph_) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph_)); + graph_ = nullptr; + } + if (exec_graph_) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph_)); + exec_graph_ = nullptr; + } +#endif + // callback should be called in reverse order because the latter added + // callback may rely on the former added callback. 
+ for (auto iter = callbacks_.rbegin(); iter != callbacks_.rend(); ++iter) { + (*iter)(); + } + callbacks_.clear(); + is_reset_ = true; +} + +void CUDAGraph::Replay() { +#if CUDA_VERSION >= 10010 + PADDLE_ENFORCE_EQ(is_reset_, false, + errors::PermissionDenied( + "Cannot replay the CUDA Graph after reset is called.")); + PADDLE_ENFORCE_NOT_NULL(exec_graph_, + errors::PermissionDenied( + "CUDA Graph must be captured before replaying.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph_, stream_)); +#endif +} + +void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream, + cudaStreamCaptureMode mode) { + ThrowErrorIfNotSupportCUDAGraph(); + PADDLE_ENFORCE_EQ( + IsCapturing(), false, + errors::PermissionDenied("CUDA Graph can only captured one by one.")); + PADDLE_ENFORCE_NOT_NULL( + stream, errors::PermissionDenied( + "CUDA Graph cannot be captured in default CUDA stream 0.")); + capturing_graph_.reset(new CUDAGraph()); + capturing_graph_->place_ = place; + capturing_graph_->stream_ = stream; + + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamBeginCapture(capturing_graph_->stream_, mode)); + cudaStreamCaptureStatus status; + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo( + capturing_graph_->stream_, &status, &(capturing_graph_->id_))); + VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_; +} + +std::unique_ptr CUDAGraph::EndCapture() { + ThrowErrorIfNotSupportCUDAGraph(); +#if CUDA_VERSION >= 10010 + PADDLE_ENFORCE_EQ(IsCapturing(), true, + errors::PermissionDenied("No CUDA Graph is capturing.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamEndCapture( + capturing_graph_->stream_, &(capturing_graph_->graph_))); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaGraphInstantiate(&(capturing_graph_->exec_graph_), + capturing_graph_->graph_, nullptr, nullptr, 0)); + VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_; + return std::move(capturing_graph_); +#endif +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/cuda_graph.h new file mode 100644 index 0000000000000..41e36049aa1a0 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph.h @@ -0,0 +1,136 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "cuda.h" // NOLINT +#include "cuda_runtime.h" // NOLINT +#include "paddle/fluid/platform/type_defs.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +#if CUDA_VERSION >= 10010 +static void ThrowErrorIfNotSupportCUDAGraph() {} +#else +enum cudaStreamCaptureMode { + cudaStreamCaptureModeGlobal = 0, + cudaStreamCaptureModeThreadLocal = 1, + cudaStreamCaptureModeRelaxed = 2 +}; +static void ThrowErrorIfNotSupportCUDAGraph() { + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported when CUDA version >= 10.1")); +} +#endif + +// NOTE: Currently, we do not support to capture CUDA graph in parallel +// NOTE: Do not use this class directly because it should be used with +// the memory pool. +class CUDAGraph { + DISABLE_COPY_AND_ASSIGN(CUDAGraph); + + // Since the constructor would throw error is CUDA_VERSION < 10010. + // The non-static method of CUDAGraph need not check CUDA_VERSION + // again. + CUDAGraph() { ThrowErrorIfNotSupportCUDAGraph(); } + + public: + ~CUDAGraph() { Reset(); } + + CUDAGraphID ID() const { return id_; } + + void Replay(); + + void Reset(); + + void AddResetCallback(std::function callback) { + std::lock_guard guard(mtx_); + callbacks_.push_back(std::move(callback)); + } + + static void BeginCapture(platform::CUDAPlace place, cudaStream_t stream, + cudaStreamCaptureMode mode); + static std::unique_ptr EndCapture(); + static void AddResetCallbackDuringCapturing(std::function callback) { + capturing_graph_->AddResetCallback(std::move(callback)); + } + + // No need to add CUDA_VERSION macro because capturing_graph_ would + // always be nullptr (constructor throws error) + static bool IsCapturing() { return capturing_graph_ != nullptr; } + + static CUDAGraphID CapturingID() { return capturing_graph_->id_; } + + static platform::CUDAPlace CapturingPlace() { + return capturing_graph_->place_; + } + + private: +#if CUDA_VERSION >= 10010 + cudaGraph_t graph_{nullptr}; + cudaGraphExec_t exec_graph_{nullptr}; +#endif + cudaStream_t stream_{nullptr}; + platform::CUDAPlace place_; + CUDAGraphID id_{0}; + std::vector> callbacks_; + bool is_reset_{false}; + std::mutex mtx_; + + static std::unique_ptr capturing_graph_; +}; + +#if CUDA_VERSION >= 10010 +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode mode) { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); + // After cudaThreadExchangeStreamCaptureMode is called, + // the variable "mode" would be set to the old capturing mode. 
+ old_mode_ = mode; + } + } + + ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaThreadExchangeStreamCaptureMode(&old_mode_)); + } + } + + private: + cudaStreamCaptureMode old_mode_; +}; +#else +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode) {} +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc new file mode 100644 index 0000000000000..1f0d39e2abe23 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +#ifdef PADDLE_WITH_CUDA +void BeginCUDAGraphCapture(platform::CUDAPlace place, + cudaStreamCaptureMode mode) { + auto stream = + platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); + CUDAGraph::BeginCapture(place, stream, mode); + auto id = CUDAGraph::CapturingID(); + memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( + id); + AddResetCallbackIfCapturingCUDAGraph([id] { + memory::allocation::AllocatorFacade::Instance().RemoveMemoryPoolOfCUDAGraph( + id); + }); +} + +std::unique_ptr EndCUDAGraphCapture() { + return CUDAGraph::EndCapture(); +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h new file mode 100644 index 0000000000000..f9f0248e5153b --- /dev/null +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph.h" +#endif + +namespace paddle { +namespace platform { + +// NOTE: These APIs are not thread-safe. 
+#ifdef PADDLE_WITH_CUDA +void BeginCUDAGraphCapture(platform::CUDAPlace place, + cudaStreamCaptureMode mode); +std::unique_ptr EndCUDAGraphCapture(); +#endif + +inline bool IsCUDAGraphCapturing() { +#ifdef PADDLE_WITH_CUDA + return CUDAGraph::IsCapturing(); +#else + return false; +#endif +} + +inline platform::CUDAPlace CUDAGraphCapturingPlace() { +#ifdef PADDLE_WITH_CUDA + return CUDAGraph::CapturingPlace(); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + +// Add reset callback if CUDA Graph is capturing. +// Otherwise, invoke callback directly. +template +inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(IsCUDAGraphCapturing())) { + return CUDAGraph::AddResetCallbackDuringCapturing( + std::forward(callback)); + } +#endif + callback(); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c4ac5aa3046a9..59e4404ffe535 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -22,6 +22,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" #else +#include "paddle/fluid/platform/cuda_graph.h" #include "paddle/fluid/platform/dynload/cudnn.h" #endif #include "paddle/fluid/memory/malloc.h" @@ -557,6 +558,7 @@ class RecordedCudaMallocHelper { #ifdef PADDLE_WITH_HIP auto result = hipMalloc(ptr, size); #else + CUDAGraphCaptureModeGuard capture_mode_guard{cudaStreamCaptureModeRelaxed}; auto result = cudaMalloc(ptr, size); #endif if (result == gpuSuccess) { diff --git a/paddle/fluid/platform/type_defs.h b/paddle/fluid/platform/type_defs.h index f46bd1a0bdfa4..88a2d16472fa7 100644 --- a/paddle/fluid/platform/type_defs.h +++ b/paddle/fluid/platform/type_defs.h @@ -36,4 +36,5 @@ using gpuEvent_t = cudaEvent_t; using gpuDeviceProp = cudaDeviceProp; #endif +using CUDAGraphID = unsigned long long; // NOLINT } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 22778013f2390..875e6af9652a2 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -7,7 +7,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model) + cost_model cuda_graph_with_memory_pool) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a16916ab33f83..6b24c64492581 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -125,6 +125,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/xpu/xpu_info.h" #endif +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + #ifdef PADDLE_WITH_CRYPTO #include "paddle/fluid/pybind/crypto.h" #endif @@ -520,6 +522,19 @@ PYBIND11_MODULE(core_noavx, m) { m.def("nccl_version", &GetNCCLVersion); #endif + m.def("is_cuda_graph_capturing", &platform::IsCUDAGraphCapturing); +#ifdef PADDLE_WITH_CUDA + py::class_(m, "CUDAGraph") + .def_static("begin_capture", + [](platform::CUDAPlace place, int mode) { + platform::BeginCUDAGraphCapture( + place, static_cast(mode)); + }) + .def_static("end_capture", &platform::EndCUDAGraphCapture) + .def("replay", &platform::CUDAGraph::Replay) + .def("reset", &platform::CUDAGraph::Reset); +#endif + m.def("wait_device", [](const platform::Place &place) { platform::DeviceContextPool::Instance().Get(place)->Wait(); }); diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py new file mode 100644 index 0000000000000..612f4d2c8cebd --- /dev/null +++ b/python/paddle/device/cuda/graphs.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace + +if is_compiled_with_cuda() and not is_compiled_with_rocm(): + from paddle.fluid.core import CUDAGraph as CoreCUDAGraph + + class CUDAGraph: + def __init__(self, place=None, mode="thread_local"): + ALL_MODES = ["global", "thread_local", "relaxed"] + self._graph = None + if place is None: + place = CUDAPlace(0) + self._place = place + assert mode in ALL_MODES + self._mode = ALL_MODES.index(mode) + + def capture_begin(self): + CoreCUDAGraph.begin_capture(self._place, self._mode) + + def capture_end(self): + self._graph = CoreCUDAGraph.end_capture() + + def replay(self): + self._graph.replay() + + def reset(self): + self._graph.reset() +else: + + class CUDAGraph: + def __init__(self, place=None, mode="thread_local"): + raise NotImplementedError() + + def capture_begin(self): + raise NotImplementedError() + + def capture_end(self): + raise NotImplementedError() + + def replay(self): + raise NotImplementedError() + + def reset(self): + raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py new file mode 100644 index 0000000000000..272d68e17fcc4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid as fluid +from paddle.device.cuda.graphs import CUDAGraph +import unittest +import numpy as np + + +class TestCUDAGraph(unittest.TestCase): + def setUp(self): + fluid.set_flags({'FLAGS_allocator_strategy': 'auto_growth'}) + + def random_tensor(self, shape): + return paddle.to_tensor( + np.random.randint( + low=0, high=10, size=shape).astype("float32")) + + def test_cuda_graph(self): + if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): + return + + shape = [2, 3] + x = self.random_tensor(shape) + z = self.random_tensor(shape) + + g = CUDAGraph() + g.capture_begin() + y = x + 10 + z.add_(x) + g.capture_end() + + for _ in range(10): + z_np_init = z.numpy() + x_new = self.random_tensor(shape) + x.copy_(x_new, False) + g.replay() + x_np = x_new.numpy() + y_np = y.numpy() + z_np = z.numpy() + self.assertTrue((y_np - x_np == 10).all()) + self.assertTrue((z_np - z_np_init == x_np).all()) + + g.reset() + + +if __name__ == "__main__": + unittest.main() From 8af939f16abf8a03fc4e30ffac267f9d75af7d13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Thu, 30 Sep 2021 10:13:23 +0800 Subject: [PATCH 12/80] fix the undefined variable bug in dist_transformer file (#36211) --- python/paddle/fluid/tests/unittests/dist_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 6546bb5549df8..db321f9417880 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1450,7 +1450,7 @@ def wrap_decoder(trg_vocab_size, # This is used to implement independent decoder program in inference. 
trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ enc_output = make_all_inputs( - decoder_data_input_fields + decoder_util_input_fields) + decoder_data_input_fields) else: trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs From 5e0f199ab02e1f1458e49a9318f40fede2c0439e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Thu, 30 Sep 2021 10:15:40 +0800 Subject: [PATCH 13/80] Fix raw optim (#36176) * fix raw optim * pre-commit test file Co-authored-by: sneaxiy --- .../meta_optimizers/raw_program_optimizer.py | 2 + .../fluid/tests/unittests/CMakeLists.txt | 2 + .../fluid/tests/unittests/test_rnn_dp.py | 157 ++++++++++++++++++ 3 files changed, 161 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_rnn_dp.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 7d899cff41871..c8eaa54f9cda1 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -460,6 +460,8 @@ def __get_ouputs_name_to_idx(self, first_backward_idx, block): if is_optimizer_op(op): break for name in op.output_arg_names: + if name == core.kEmptyVarName(): + continue var = block.var(name) if not outputs_name_to_idx.get(var): # if the grad only be generated by one op diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 129fbb9ac3328..cd1c4363879bb 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -21,6 +21,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) +list(APPEND DIST_TEST_OPS test_rnn_dp) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) @@ -66,6 +67,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) list(APPEND MIXED_DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_rnn_dp) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) diff --git a/python/paddle/fluid/tests/unittests/test_rnn_dp.py b/python/paddle/fluid/tests/unittests/test_rnn_dp.py new file mode 100644 index 0000000000000..8d7e86fcdb9c7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rnn_dp.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os + +import numpy as np +import paddle +import paddle.static as static +import paddle.distributed.fleet as fleet +import paddle.nn as nn +import paddle.nn.functional as F + +paddle.enable_static() + + +class RNNEncoder(nn.Layer): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0.0, + pooling_type=None, + **kwargs): + super().__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._direction = direction + self._pooling_type = pooling_type + + self.rnn_layer = nn.SimpleRNN( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + direction=direction, + dropout=dropout, + **kwargs) + + def get_input_dim(self): + return self._input_size + + def get_output_dim(self): + if self._direction == "bidirect": + return self._hidden_size * 2 + else: + return self._hidden_size + + def forward(self, inputs, sequence_length): + encoded_text, last_hidden = self.rnn_layer( + inputs, sequence_length=sequence_length) + output = paddle.max(encoded_text, axis=1) + return output + + +class RNNModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + rnn_hidden_size=198, + direction='forward', + rnn_layers=1, + dropout_rate=0.0, + pooling_type=None, + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=emb_dim, + padding_idx=padding_idx) + self.rnn_encoder = RNNEncoder( + emb_dim, + rnn_hidden_size, + num_layers=rnn_layers, + direction=direction, + dropout=dropout_rate, + pooling_type=pooling_type) + self.fc = nn.Linear(self.rnn_encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len): + embedded_text = self.embedder(text) + text_repr = self.rnn_encoder(embedded_text, sequence_length=seq_len) + fc_out = paddle.tanh(self.fc(text_repr)) + logits = self.output_layer(fc_out) + return logits + + +def rnn_pretrain_forward(train_program, start_program, topo=None): + with static.program_guard(train_program, + start_program), paddle.utils.unique_name.guard(): + batch_size = 1 + tokens = static.data( + name="tokens", shape=[batch_size, -1], dtype="int64") + seq_len = static.data(name="ids", shape=[batch_size], dtype="int64") + labels = static.data(name="labels", shape=[batch_size], dtype="int64") + data_holders = [tokens, seq_len, labels] + vocab_size = 10 + num_classes = 2 + pad_token_id = 0 + model = RNNModel( + vocab_size, + num_classes, + direction='forward', + padding_idx=pad_token_id, + pooling_type='max') + + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), learning_rate=0.001) + criterion = paddle.nn.CrossEntropyLoss() + preds = model(tokens, seq_len) + loss = criterion(preds, labels) + + return train_program, start_program, loss, optimizer, data_holders + + +class TestFleetMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def test_rnn_raw_optimizer(self): + import paddle.distributed.fleet as fleet + import paddle.distributed.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + train_program = static.Program() + start_program = static.Program() + train_program, start_program, 
loss, optimizer, data_holders = \ + rnn_pretrain_forward(train_program, start_program) + with paddle.static.program_guard( + train_program, start_program), paddle.utils.unique_name.guard(): + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + strategy.fuse_all_reduce_ops = True + fleet.init(is_collective=True, strategy=strategy) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + +if __name__ == "__main__": + unittest.main() From a66b9fba3b5ada77ef5c3cc1b8e398395676a730 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Thu, 30 Sep 2021 14:18:24 +0800 Subject: [PATCH 14/80] [NPU] modify transpose2 and index_select_grad kernels for model xlnet (#36214) * [NPU] modify transpose2 and index_select_grad kernels for model xlnet * add transpose2 int64_t unit test * add more transpose2 unit tests * update test_transpose_op_npu.py --- paddle/fluid/operators/index_select_op_npu.cc | 17 ++-- paddle/fluid/operators/transpose_op_npu.cc | 21 +++- .../unittests/npu/test_transpose_op_npu.py | 98 +++++++++++++++---- 3 files changed, 107 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index b624d03cc8555..825229282f3da 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -99,10 +99,11 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { transed_out_dims[i] = out_dims[in_trans_perm[i]]; } transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - framework::NPUAttributeMap in_trans_attr = {{"perm", in_trans_perm}}; - - const auto& in_trans_runner = NpuOpRunner( - "TransposeD", {*out_grad}, {transed_out_grad}, in_trans_attr); + NpuOpRunner in_trans_runner; + in_trans_runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(in_trans_perm)) + .AddOutput(transed_out_grad); in_trans_runner.Run(stream); Tensor sum_out; @@ -133,10 +134,12 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { for (int i = 1 + dim; i < x_dims.size(); ++i) { out_trans_perm.push_back(i); } - framework::NPUAttributeMap out_trans_attr = {{"perm", out_trans_perm}}; x_grad->mutable_data(ctx.GetPlace()); - const auto& out_trans_runner = - NpuOpRunner("TransposeD", {sum_out}, {*x_grad}, out_trans_attr); + NpuOpRunner out_trans_runner; + out_trans_runner.SetType("Transpose") + .AddInput(sum_out) + .AddInput(std::move(out_trans_perm)) + .AddOutput(*x_grad); out_trans_runner.Run(stream); } } diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc index 035ad5f3f314a..7cc68e93c5d62 100644 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ b/paddle/fluid/operators/transpose_op_npu.cc @@ -27,9 +27,12 @@ class TransposeNPUKernel : public framework::OpKernel { auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); std::vector axis = ctx.Attr>("axis"); - framework::NPUAttributeMap attr_input = {{"perm", axis}}; out->mutable_data(ctx.device_context().GetPlace()); - const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*x) + .AddInput(std::move(axis)) + .AddOutput(*out); auto stream = ctx.template device_context() .stream(); @@ -51,9 +54,11 @@ class TransposeGradNPUKernel : public framework::OpKernel { reversed_axis[axis[i]] = i; } x_grad->mutable_data(ctx.GetPlace()); - framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; - const auto& runner = - 
NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(reversed_axis)) + .AddOutput(*x_grad); auto stream = ctx.template device_context() .stream(); @@ -72,11 +77,17 @@ REGISTER_OP_NPU_KERNEL( ops::TransposeNPUKernel, ops::TransposeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeNPUKernel, +#endif ops::TransposeNPUKernel, ops::TransposeNPUKernel); REGISTER_OP_NPU_KERNEL(transpose2_grad, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeGradNPUKernel, +#endif ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py index e95f3cc83cfb3..b1a6bfcdaaadc 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py @@ -31,40 +31,104 @@ def setUp(self): self.op_type = "transpose2" self.place = paddle.NPUPlace(0) self.init_dtype() - self.init_input_output() - self.init_kernel_type() - self.init_axis() + self.init_shape_axis() - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)} - self.attrs = {'axis': [0, 2, 1, 3], 'data_format': 'AnyLayout'} - self.outputs = {'Out': self.out} + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.attrs = {'axis': self.axis, 'data_format': 'AnyLayout'} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} def set_npu(self): self.__class__.use_npu = True - def init_kernel_type(self): - self.use_mkldnn = False - - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [8, 512, 12, 64]).astype(self.dtype) - self.out = np.transpose(self.x, [0, 2, 1, 3]) - def init_dtype(self): self.dtype = np.float32 - def init_axis(self): - self.axis = -1 + def init_shape_axis(self): + self.shape = (3, 40) + self.axis = (1, 0) def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestCase0(TestTransposeOp): + def init_shape_axis(self): + self.shape = (100, ) + self.axis = (0, ) + + +class TestCase1(TestTransposeOp): + def init_shape_axis(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + + +class TestCase2(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +class TestCase5(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 16, 96) + self.axis = (0, 2, 1) -class TestTransposeOpFP16(TestTransposeOp): - no_need_check_grad = True +class TestCase6(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 10, 12, 16) + self.axis = (3, 1, 2, 0) + + +class TestCase7(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 10, 2, 16) + self.axis = (0, 1, 3, 2) + + +class TestCase8(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + 
+ +class TestTransposeOpFP16(TestTransposeOp): def init_dtype(self): self.dtype = np.float16 + def test_check_grad(self): + pass + + +class TestTransposeOpInt64(TestTransposeOp): + def init_dtype(self): + self.dtype = np.int64 + + def test_check_grad(self): + pass + if __name__ == '__main__': unittest.main() From 56b04bc19fa68f6767dc83cd26b8b4a35ad69d5e Mon Sep 17 00:00:00 2001 From: levi131 <83750468+levi131@users.noreply.github.com> Date: Thu, 30 Sep 2021 16:48:01 +0800 Subject: [PATCH 15/80] add test_hessian time out (#36234) --- python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt index 1e9d433ebce8e..369134c8989a0 100644 --- a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -7,4 +7,4 @@ foreach(TEST_OP ${TEST_OPS}) endforeach(TEST_OP) set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) -set_tests_properties(test_hessian PROPERTIES TIMEOUT 20) +set_tests_properties(test_hessian PROPERTIES TIMEOUT 50) From c12176e88566a97ca0f3efec071eaaebade9cd9a Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 30 Sep 2021 17:30:34 +0800 Subject: [PATCH 16/80] fix yolo (#36240) --- paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index ee1709f57e259..10123cd4fa0e1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -119,10 +119,10 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, int img_height, int img_width, float scale, float bias) { box[0] = static_cast( - (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + (i + sigmoid(static_cast(x[index])) * scale + bias) * img_width / grid_size_w); box[1] = static_cast( - (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + (j + sigmoid(static_cast(x[index + stride])) * scale + bias) * img_height / grid_size_h); box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * anchors[2 * an_idx] * img_width / input_size_w); From 0a3dbe8a26ae592623002a3eb2d17978c77b919f Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Thu, 30 Sep 2021 18:16:01 +0800 Subject: [PATCH 17/80] add slotrecord datafeed (#36099) --- paddle/fluid/framework/data_feed.cc | 642 ++++++++++++++++++ paddle/fluid/framework/data_feed.h | 38 +- paddle/fluid/framework/data_feed_factory.cc | 5 +- paddle/fluid/framework/data_set.cc | 30 +- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 100 ++- paddle/fluid/platform/flags.cc | 4 +- 6 files changed, 787 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 4463fd9fd5340..2d089b4721b82 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -28,6 +28,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/timer.h" USE_INT_STAT(STAT_total_feasign_num_in_mem); +DECLARE_bool(enable_ins_parser_file); namespace paddle { namespace framework { @@ -1929,5 +1930,646 @@ void PaddleBoxDataFeed::PutToFeedVec(const std::vector& ins_vec) { #endif } +template class InMemoryDataFeed; +void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + platform::errors::PreconditionNotMet( + "Multi_slot_desc has not been set in data_feed_desc")); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + + all_slots_.resize(all_slot_num); + all_slots_info_.resize(all_slot_num); + used_slots_info_.resize(all_slot_num); + use_slot_size_ = 0; + use_slots_.clear(); + + float_total_dims_size_ = 0; + float_total_dims_without_inductives_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + + AllSlotInfo& all_slot = all_slots_info_[i]; + all_slot.slot = slot.name(); + all_slot.type = slot.type(); + all_slot.used_idx = slot.is_used() ? use_slot_size_ : -1; + all_slot.slot_value_idx = -1; + + if (slot.is_used()) { + UsedSlotInfo& info = used_slots_info_[use_slot_size_]; + info.idx = i; + info.slot = slot.name(); + info.type = slot.type(); + info.dense = slot.is_dense(); + info.total_dims_without_inductive = 1; + info.inductive_shape_index = -1; + + // record float value and uint64_t value pos + if (info.type[0] == 'u') { + info.slot_value_idx = uint64_use_slot_size_; + all_slot.slot_value_idx = uint64_use_slot_size_; + ++uint64_use_slot_size_; + } else if (info.type[0] == 'f') { + info.slot_value_idx = float_use_slot_size_; + all_slot.slot_value_idx = float_use_slot_size_; + ++float_use_slot_size_; + } + + use_slots_.push_back(slot.name()); + + if (slot.is_dense()) { + for (int j = 0; j < slot.shape_size(); ++j) { + if (slot.shape(j) > 0) { + info.total_dims_without_inductive *= slot.shape(j); + } + if (slot.shape(j) == -1) { + info.inductive_shape_index = j; + } + } + } + if (info.type[0] == 'f') { + float_total_dims_without_inductives_.push_back( + info.total_dims_without_inductive); + float_total_dims_size_ += info.total_dims_without_inductive; + } + info.local_shape.clear(); + for (int j = 0; j < slot.shape_size(); ++j) { + info.local_shape.push_back(slot.shape(j)); + } + ++use_slot_size_; + } + } + used_slots_info_.resize(use_slot_size_); + + feed_vec_.resize(used_slots_info_.size()); + const int kEstimatedFeasignNumPerSlot = 5; // Magic Number + for (size_t i = 0; i < all_slot_num; i++) { + batch_float_feasigns_.push_back(std::vector()); + batch_uint64_feasigns_.push_back(std::vector()); + batch_float_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + batch_uint64_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + offset_.push_back(std::vector()); + offset_[i].reserve(default_batch_size_ + + 1); // Each lod info will prepend a zero + } + visit_.resize(all_slot_num, false); + pipe_command_ = data_feed_desc.pipe_command(); + finish_init_ = true; + input_type_ = data_feed_desc.input_type(); + size_t pos = pipe_command_.find(".so"); + if (pos != std::string::npos) { + pos = pipe_command_.rfind('|'); + if (pos == std::string::npos) { + so_parser_name_ = pipe_command_; + 
pipe_command_.clear(); + } else { + so_parser_name_ = pipe_command_.substr(pos + 1); + pipe_command_ = pipe_command_.substr(0, pos); + } + so_parser_name_ = paddle::string::erase_spaces(so_parser_name_); + } else { + so_parser_name_.clear(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemory() { + VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; + if (!so_parser_name_.empty()) { + LoadIntoMemoryByLib(); + } else { + LoadIntoMemoryByCommand(); + } +} +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLib(void) { + if (true) { + // user defined file format analysis + LoadIntoMemoryByFile(); + } else { + LoadIntoMemoryByLine(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByFile(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + CHECK(parser != nullptr); + // get slotrecord object + auto pull_record_func = [this](std::vector& record_vec, + int max_fetch_num, int offset) { + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec[0], offset); + } else { // free all + max_fetch_num = static_cast(record_vec.size()); + if (max_fetch_num > offset) { + SlotRecordPool().put(&record_vec[offset], (max_fetch_num - offset)); + } + } + } else if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec, max_fetch_num); + } else { + SlotRecordPool().put(&record_vec); + } + }; + + std::string filename; + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + platform::Timer timeline; + timeline.Start(); + + int lines = 0; + bool is_ok = true; + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + is_ok = parser->ParseFileInstance( + [this](char* buf, int len) { + return fread(buf, sizeof(char), len, this->fp_.get()); + }, + pull_record_func, lines); + + if (!is_ok) { + LOG(WARNING) << "parser error, filename=" << filename + << ", lines=" << lines; + } + } while (!is_ok); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all file, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ << ", lines=" << lines; + } +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLine(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + BufferedLineFileReader::LineFunc line_func = nullptr; + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + int offset = 0; + int old_offset = 0; + + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + // get slotrecord object function + auto record_func = [this, &offset, &record_vec, &old_offset]( + std::vector& vec, int num) { + vec.resize(num); + if (offset + num > OBJPOOL_BLOCK_SIZE) { + input_channel_->WriteMove(offset, &record_vec[0]); + SlotRecordPool().get(&record_vec[0], offset); + record_vec.resize(OBJPOOL_BLOCK_SIZE); + offset = 0; + old_offset = 0; + } + for (int i = 0; i < num; ++i) { + auto& ins = record_vec[offset + i]; + ins->reset(); + vec[i] = ins; + } + offset = offset + 
num; + }; + + line_func = [this, &parser, &record_vec, &offset, &filename, &record_func, + &old_offset](const std::string& line) { + old_offset = offset; + if (!parser->ParseOneInstance(line, record_func)) { + offset = old_offset; + LOG(WARNING) << "read file:[" << filename << "] item error, line:[" + << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }; + + int lines = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + lines = line_reader.read_file(this->fp_.get(), line_func, lines); + } while (line_reader.is_error()); + + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all lines, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ << ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", filesize=" << line_reader.file_size() / 1024.0 / 1024.0 + << "MB"; + } + + VLOG(3) << "LoadIntoMemoryByLib() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByCommand(void) { +#ifdef _LINUX + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + int lines = 0; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + int offset = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + + lines = line_reader.read_file( + this->fp_.get(), + [this, &record_vec, &offset, &filename](const std::string& line) { + if (ParseOneInstance(line, &record_vec[offset])) { + ++offset; + } else { + LOG(WARNING) << "read file:[" << filename + << "] item error, line:[" << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }, + lines); + } while (line_reader.is_error()); + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemory() read all lines, file=" << filename + << ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_; + } + VLOG(3) << "LoadIntoMemory() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +static void parser_log_key(const 
std::string& log_key, uint64_t* search_id, + uint32_t* cmatch, uint32_t* rank) { + std::string searchid_str = log_key.substr(16, 16); + *search_id = static_cast(strtoull(searchid_str.c_str(), NULL, 16)); + std::string cmatch_str = log_key.substr(11, 3); + *cmatch = static_cast(strtoul(cmatch_str.c_str(), NULL, 16)); + std::string rank_str = log_key.substr(14, 2); + *rank = static_cast(strtoul(rank_str.c_str(), NULL, 16)); +} + +bool SlotRecordInMemoryDataFeed::ParseOneInstance(const std::string& line, + SlotRecord* ins) { + SlotRecord& rec = (*ins); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + + thread_local std::vector> slot_float_feasigns; + thread_local std::vector> slot_uint64_feasigns; + slot_float_feasigns.resize(float_use_slot_size_); + slot_uint64_feasigns.resize(uint64_use_slot_size_); + + if (parse_ins_id_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + rec->ins_id_ = std::string(str + pos, len); + pos += len + 1; + } + if (parse_logkey_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + // parse_logkey + std::string log_key = std::string(str + pos, len); + uint64_t search_id; + uint32_t cmatch; + uint32_t rank; + parser_log_key(log_key, &search_id, &cmatch, &rank); + + rec->ins_id_ = log_key; + rec->search_id = search_id; + rec->cmatch = cmatch; + rec->rank = rank; + pos += len + 1; + } + + int float_total_slot_num = 0; + int uint64_total_slot_num = 0; + + for (size_t i = 0; i < all_slots_info_.size(); ++i) { + auto& info = all_slots_info_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE(num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (info.used_idx != -1) { + if (info.type[0] == 'f') { // float + auto& slot_fea = slot_float_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + if (fabs(feasign) < 1e-6 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++float_total_slot_num; + } + } else if (info.type[0] == 'u') { // uint64 + auto& slot_fea = slot_uint64_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + uint64_t feasign = + static_cast(strtoull(endptr, &endptr, 10)); + if (feasign == 0 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++uint64_total_slot_num; + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + rec->slot_float_feasigns_.add_slot_feasigns(slot_float_feasigns, + float_total_slot_num); + rec->slot_uint64_feasigns_.add_slot_feasigns(slot_uint64_feasigns, + uint64_total_slot_num); + + return (uint64_total_slot_num > 0); +} + +void SlotRecordInMemoryDataFeed::PutToFeedVec(const SlotRecord* ins_vec, + int num) { + for (int j = 0; j < use_slot_size_; ++j) { + auto& feed = feed_vec_[j]; + if (feed == nullptr) { + continue; + } + + auto& slot_offset = offset_[j]; + slot_offset.clear(); + slot_offset.reserve(num + 1); + slot_offset.push_back(0); 
+ + int total_instance = 0; + auto& info = used_slots_info_[j]; + // fill slot value with default value 0 + if (info.type[0] == 'f') { // float + auto& batch_fea = batch_float_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + float* slot_values = + r->slot_float_feasigns_.get_values(info.slot_value_idx, &fea_num); + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(float) * fea_num); + total_instance += fea_num; + slot_offset.push_back(total_instance); + } + + float* feasign = batch_fea.data(); + float* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float)); + + } else if (info.type[0] == 'u') { // uint64 + auto& batch_fea = batch_uint64_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + uint64_t* slot_values = + r->slot_uint64_feasigns_.get_values(info.slot_value_idx, &fea_num); + if (fea_num > 0) { + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(uint64_t) * fea_num); + total_instance += fea_num; + } + if (fea_num == 0) { + batch_fea.resize(total_instance + fea_num); + batch_fea[total_instance] = 0; + total_instance += 1; + } + slot_offset.push_back(total_instance); + } + + // no uint64_t type in paddlepaddle + uint64_t* feasign = batch_fea.data(); + int64_t* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t)); + } + + if (info.dense) { + if (info.inductive_shape_index != -1) { + info.local_shape[info.inductive_shape_index] = + total_instance / info.total_dims_without_inductive; + } + feed->Resize(framework::make_ddim(info.local_shape)); + } else { + LoD data_lod{slot_offset}; + feed_vec_[j]->set_lod(data_lod); + } + } +} + +void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) { + SlotRecord& ins = (*rec); + if (ins->slot_float_feasigns_.slot_offsets.empty()) { + return; + } + size_t total_value_size = ins->slot_float_feasigns_.slot_values.size(); + if (float_total_dims_size_ == total_value_size) { + return; + } + int float_slot_num = + static_cast(float_total_dims_without_inductives_.size()); + CHECK(float_slot_num == float_use_slot_size_); + std::vector old_values; + std::vector old_offsets; + old_values.swap(ins->slot_float_feasigns_.slot_values); + old_offsets.swap(ins->slot_float_feasigns_.slot_offsets); + + ins->slot_float_feasigns_.slot_values.resize(float_total_dims_size_); + ins->slot_float_feasigns_.slot_offsets.assign(float_slot_num + 1, 0); + + auto& slot_offsets = ins->slot_float_feasigns_.slot_offsets; + auto& slot_values = ins->slot_float_feasigns_.slot_values; + + uint32_t offset = 0; + int num = 0; + uint32_t old_off = 0; + int dim = 0; + + for (int i = 0; i < float_slot_num; ++i) { + dim = float_total_dims_without_inductives_[i]; + old_off = old_offsets[i]; + num = static_cast(old_offsets[i + 1] - old_off); + if (num == 0) { + // fill slot value with default value 0 + for (int k = 0; k < dim; ++k) { + slot_values[k + offset] = 0.0; + } + } else { + if (num == dim) { + memcpy(&slot_values[offset], &old_values[old_off], dim * sizeof(float)); + } else { + // position fea + // record position index need fix values + int pos_idx = static_cast(old_values[old_off]); + for (int k = 0; k < dim; ++k) { + if (k == pos_idx) { + slot_values[k + offset] = 
1.0; + } else { + slot_values[k + offset] = 0.0; + } + } + } + } + slot_offsets[i] = offset; + offset += dim; + } + slot_offsets[float_slot_num] = offset; + CHECK(float_total_dims_size_ == static_cast(offset)); +} + +bool SlotRecordInMemoryDataFeed::Start() { +#ifdef _LINUX + this->CheckSetFileList(); + if (input_channel_->Size() != 0) { + std::vector data; + input_channel_->Read(data); + } +#endif + if (batch_offsets_.size() > 0) { + VLOG(3) << "batch_size offsets: " << batch_offsets_.size(); + enable_heterps_ = true; + this->offset_index_ = 0; + } + this->finish_start_ = true; + return true; +} + +int SlotRecordInMemoryDataFeed::Next() { +#ifdef _LINUX + this->CheckStart(); + + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + if (offset_index_ >= batch_offsets_.size()) { + VLOG(3) << "offset_index: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + return 0; + } + auto& batch = batch_offsets_[offset_index_++]; + this->batch_size_ = batch.second; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(&records_[batch.first], this->batch_size_); + } else { + VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" + << thread_id_; + } + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size() + << " baych_size: " << this->batch_size_; + + return this->batch_size_; +#else + return 0; +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 5527eaf1f6fa4..a4100e66e7285 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -384,7 +384,7 @@ class CustomParser { CustomParser() {} virtual ~CustomParser() {} virtual void Init(const std::vector& slots) = 0; - virtual bool Init(const std::vector& slots) = 0; + virtual bool Init(const std::vector& slots); virtual void ParseOneInstance(const char* str, Record* instance) = 0; virtual bool ParseOneInstance( const std::string& line, @@ -1103,6 +1103,42 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { virtual void PutToFeedVec(const Record* ins_vec, int num); }; +class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { + public: + SlotRecordInMemoryDataFeed() {} + virtual ~SlotRecordInMemoryDataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc); + virtual void LoadIntoMemory(); + void ExpandSlotRecord(SlotRecord* ins); + + protected: + virtual bool Start(); + virtual int Next(); + virtual bool ParseOneInstance(SlotRecord* instance) { return false; } + virtual bool ParseOneInstanceFromPipe(SlotRecord* instance) { return false; } + // virtual void ParseOneInstanceFromSo(const char* str, T* instance, + // CustomParser* parser) {} + virtual void PutToFeedVec(const std::vector& ins_vec) {} + + virtual void LoadIntoMemoryByCommand(void); + virtual void LoadIntoMemoryByLib(void); + virtual void LoadIntoMemoryByLine(void); + virtual void LoadIntoMemoryByFile(void); + virtual void SetInputChannel(void* channel) { + input_channel_ = static_cast*>(channel); + } + bool ParseOneInstance(const std::string& line, SlotRecord* rec); + virtual void PutToFeedVec(const SlotRecord* ins_vec, int num); + float sample_rate_ = 1.0f; + int use_slot_size_ = 0; + int float_use_slot_size_ = 0; + int uint64_use_slot_size_ = 0; + std::vector all_slots_info_; + std::vector used_slots_info_; + size_t float_total_dims_size_ = 
0; + std::vector float_total_dims_without_inductives_; +}; + class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { public: PaddleBoxDataFeed() {} diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index ec1b8ec773fa6..e46e4aeb0124c 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -58,8 +58,8 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( std::string data_feed_class) { if (g_data_feed_map.count(data_feed_class) < 1) { LOG(WARNING) << "Your DataFeed " << data_feed_class - << "is not supported currently"; - LOG(WARNING) << "Supported DataFeed: " << DataFeedTypeList(); + << " is not supported currently"; + LOG(WARNING) << " Supported DataFeed: " << DataFeedTypeList(); exit(-1); } return g_data_feed_map[data_feed_class](); @@ -68,6 +68,7 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); +REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 82a39b206e6bd..2a071665b263c 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -1609,7 +1609,35 @@ void SlotRecordDataset::DynamicAdjustChannelNum(int channel_num, void SlotRecordDataset::PrepareTrain() { #ifdef PADDLE_WITH_GLOO - return; + if (enable_heterps_) { + if (input_records_.size() == 0 && input_channel_ != nullptr && + input_channel_->Size() != 0) { + input_channel_->ReadAll(input_records_); + VLOG(3) << "read from channel to records with records size: " + << input_records_.size(); + } + VLOG(3) << "input records size: " << input_records_.size(); + int64_t total_ins_num = input_records_.size(); + std::vector> offset; + int default_batch_size = + reinterpret_cast(readers_[0].get()) + ->GetDefaultBatchSize(); + VLOG(3) << "thread_num: " << thread_num_ + << " memory size: " << total_ins_num + << " default batch_size: " << default_batch_size; + compute_thread_batch_nccl(thread_num_, total_ins_num, default_batch_size, + &offset); + VLOG(3) << "offset size: " << offset.size(); + for (int i = 0; i < thread_num_; i++) { + reinterpret_cast(readers_[i].get()) + ->SetRecord(&input_records_[0]); + } + for (size_t i = 0; i < offset.size(); i++) { + reinterpret_cast( + readers_[i % thread_num_].get()) + ->AddBatchOffset(offset[i]); + } + } #else PADDLE_THROW(platform::errors::Unavailable( "dataset set heterps need compile with GLOO")); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 784cbc3d90b86..d1e98a711dc9d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -45,9 +45,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - MultiSlotDataset* dataset = dynamic_cast(dataset_); gpu_task->init(thread_keys_shard_num_, device_num); - auto input_channel = dataset->GetInputChannel(); auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; @@ -68,35 +66,83 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { for (int i = 0; i < thread_keys_thread_num_; i++) { 
thread_keys_[i].resize(thread_keys_shard_num_); } - const std::deque& vec_data = input_channel->GetData(); - size_t total_len = vec_data.size(); - size_t len_per_thread = total_len / thread_keys_thread_num_; - int remain = total_len % thread_keys_thread_num_; + + size_t total_len = 0; + size_t len_per_thread = 0; + int remain = 0; size_t begin = 0; - auto gen_func = [this](const std::deque& total_data, int begin_index, - int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins.uint64_feasigns_; - for (const auto feasign : feasign_v) { - uint64_t cur_key = feasign.sign().uint64_feasign_; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); + + std::string data_set_name = std::string(typeid(*dataset_).name()); + + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { + VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + VLOG(0) << "yxf::buildtask::inputslotchannle size: " + << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + VLOG(0) << "total len: " << total_len; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(feasign); + } } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - threads.push_back(std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - begin += len_per_thread + (i < remain ? 
1 : 0); - } - for (std::thread& t : threads) { - t.join(); + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + } else { + CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins.uint64_feasigns_; + for (const auto feasign : feasign_v) { + uint64_t cur_key = feasign.sign().uint64_feasign_; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } - timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; timeline.Start(); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 72b95dcc15346..7a7666665511f 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -687,4 +687,6 @@ DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); DEFINE_bool(enable_slotpool_wait_release, false, "enable slotrecord obejct wait release, default false"); DEFINE_bool(enable_slotrecord_reset_shrink, false, - "enable slotrecord obejct reset shrink memory, default false"); \ No newline at end of file + "enable slotrecord obejct reset shrink memory, default false"); +DEFINE_bool(enable_ins_parser_file, false, + "enable parser ins file , default false"); From 2cee0ea7b26cb71fc4d06f5074d57f457a7db1f1 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 4 Oct 2021 09:49:48 +0200 Subject: [PATCH 18/80] added Piotr to authors.md and updated Intel-related paddle authors image (#36254) --- AUTHORS.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/AUTHORS.md b/AUTHORS.md index 1eaaff2977143..60f5b424abb7a 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -3,7 +3,7 @@ | abhinavarora | Abhinav Arora | | andreazanetti | Andrea Zanetti | | arlesniak | Artur Lesniak | -| arogowie-intel | Adam Osewski | +| [arogowie-intel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Adam Osewski | | backyes | Yan-Fei Wang | | baiyfbupt | Yi-Fan Bai | | beckett1124 | Bin Qi | @@ -25,8 +25,8 @@ | hedaoyuan | Dao-Yuan He | | helinwang | He-Lin Wang | | jacquesqiao | Long-Fei Qiao | -| jakpiase | Jakub Piasecki | -| [jczaja](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Jacek Czaja | +| [jakpiase](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jakub Piasecki | +| 
[jczaja](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jacek Czaja | | JiayiFeng | Jia-Yi Feng | | kbinias | Krzysztof Binias | | kexinzhao | Ke-Xin Zhao | @@ -47,7 +47,8 @@ | pakchoi | Chuan-Jiang Song | | panyx0718 | Xin Pan | | pengli09 | Peng Li | -| pmajchrzak |Piotr Majchrzak | +| [piotrekobiIntel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Paturej | +| [pmajchrzak](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Majchrzak | | pkuyym | Ya-Ming Yang | | pzelazko-intel | Pawel Zelazko | | [pawelpiotrowicz](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Pawel Piotrowicz | @@ -55,12 +56,13 @@ | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus | -| [sfraczek](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Sylwester Fraczek | +| [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek | | sneaxiy | Jin-Le Zeng | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | +| [tsocha](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Tomasz Socha | | typhoonzero | Yi Wu | | velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | @@ -68,7 +70,7 @@ | wangzhen-nlp | Zhen Wang | | wen-bo-yang | Wen-Bo Yang | | wojtuss | Wojciech Uss | -| wozna | Joanna Wozna | +| [wozna](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Joanna Wozna | | wwhu | Wei-Wei Hu | | xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | From dc4d5719060aac5aaaec868c1c055cd27f8e812a Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Tue, 5 Oct 2021 13:38:19 +0200 Subject: [PATCH 19/80] Added concat BF16/FP32 BWD OneDNN kernel (#35889) * tmp * added concat BF16/FP32 BWD oneDNN kernel * minor change * minor change * fix for CI * added formatting * Reverted deleting static keyword * added reviewers suggestions * reverted deleting concat bf16 test file * fixed concat tests --- paddle/fluid/operators/concat_op.cc | 18 ++- .../operators/mkldnn/concat_mkldnn_op.cc | 71 +++++++++++ .../mkldnn/test_concat_bf16_mkldnn_op.py | 27 ++++- .../unittests/mkldnn/test_concat_mkldnn_op.py | 114 ++++++++++-------- .../fluid/tests/unittests/test_concat_op.py | 2 +- 5 files changed, 171 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a400d27b798e3..e6b1f6a1c18c3 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -169,9 +169,21 @@ class ConcatOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); + auto input_data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + // extra checking if attr "use_mkldnn" exist is needed because + // test_reverse_op is calling concat_grad kernel without setting + // "use_mkldnn" 
to any value + if (ctx.HasAttr("use_mkldnn") && + this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 57a56776736ff..4cc96a48bd26f 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -23,6 +23,7 @@ namespace operators { using framework::DataLayout; using framework::Tensor; +using framework::LoDTensor; using mkldnn::memory; using mkldnn::primitive; using mkldnn::concat; @@ -149,6 +150,72 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_format(platform::GetMKLDNNFormat(*dst_mem)); } }; + +template +class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + + const auto x = ctx.MultiInput("X"); + const auto* dout = ctx.Input(framework::GradVarName("Out")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + + int axis = ctx.Attr("axis"); + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + } + + auto dout_vec_dims = framework::vectorize(dout->dims()); + + axis = ComputeAxis(axis, dout_vec_dims.size()); + + std::vector offset(dout_vec_dims.size(), 0); + + mkldnn::memory::data_type dout_type = + framework::ToMKLDNNDataType(dout->type()); + platform::ReorderMKLDNNHandler reorder_handler(dout_vec_dims, dout->type(), + dout_type, onednn_engine); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + for (size_t i = 0; i < dx.size(); ++i) { + if (out_var_names[i] != framework::kEmptyVarName && + dx[i]->numel() != 0UL) { + auto dx_vec_dims = framework::vectorize(dx[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + dx_vec_dims, offset, reorder_src_memory_p); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dx[i], dx_vec_dims, dout->format(), ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += dx[i]->dims()[axis]; + + dx[i]->set_layout(framework::DataLayout::kMKLDNN); + dx[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + } + astream.wait(); + } +}; + } // namespace operators } // namespace paddle @@ -159,3 +226,7 @@ REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel); + +REGISTER_OP_KERNEL(concat_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConcatGradMKLDNNOpKernel, + ops::ConcatGradMKLDNNOpKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py 
b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py index 2b7b2b36afa4f..e53afaa57be1c 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py @@ -40,13 +40,28 @@ def setUp(self): 'mkldnn_data_type': self.mkldnn_data_type } + self.sections = [self.x0.shape[self.axis]] * 2 + self.sections[1] += self.x1.shape[self.axis] + self.output = np.concatenate( (self.x0, self.x1, self.x2), axis=self.axis).astype(np.uint16) self.outputs = {'Out': self.output} + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dxs = np.split(self.dout, self.sections, self.axis) + def test_check_output(self): self.check_output_with_place(core.CPUPlace()) + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["x0", "x1", "x2"], + "Out", + user_defined_grads=[self.dxs[0], self.dxs[1], self.dxs[2]], + user_defined_grad_outputs=[self.dout]) + # --------------------test concat bf16 in with axis 0-------------------- def init_test_data(self): @@ -61,9 +76,9 @@ def init_axis(self): self.axis = 0 def init_shape(self): - self.x0_shape = [2, 2, 1, 2] - self.x1_shape = [1, 2, 1, 2] - self.x2_shape = [3, 2, 1, 2] + self.x0_shape = [6, 2, 4, 3] + self.x1_shape = [7, 2, 4, 3] + self.x2_shape = [8, 2, 4, 3] # --------------------test concat bf16 in with axis 1-------------------- @@ -74,9 +89,9 @@ def init_axis(self): self.axis = 1 def init_shape(self): - self.x0_shape = [1, 1, 5, 5] - self.x1_shape = [1, 2, 5, 5] - self.x2_shape = [1, 3, 5, 5] + self.x0_shape = [1, 4, 5, 5] + self.x1_shape = [1, 8, 5, 5] + self.x2_shape = [1, 6, 5, 5] # --------------------test concat bf16 in with axis 2-------------------- diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py index 4900b42d3618d..7fc8f1d30802c 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py @@ -15,78 +15,90 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3, TestConcatOp4 +import numpy as np +import struct - -class TestMKLDNNConcatOp(TestConcatOp): - def setUp(self): - super(TestMKLDNNConcatOp, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) - - def test_check_grad(self): - pass - - def init_kernel_type(self): - self.use_mkldnn = True +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle import enable_static -class TestMKLDNNConcatOp2(TestConcatOp2): +class TestConcatAxis0OneDNNOp(OpTest): def setUp(self): - super(TestMKLDNNConcatOp2, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True + self.op_type = "concat" + self.mkldnn_data_type = "float32" + self.init_axis() + self.init_shape() + self.init_test_data() + self.configure_datatype() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = { + 'axis': self.axis, + 'use_mkldnn': True, + 'mkldnn_data_type': self.mkldnn_data_type + } + + self.output = np.concatenate( + (self.x0, self.x1, self.x2), 
axis=self.axis).astype(self.dtype) + + self.outputs = {'Out': self.output} + + def configure_datatype(self): + self.mkldnn_data_type = "float32" + self.dtype = np.float32 def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) + self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - pass + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') - def init_kernel_type(self): - self.use_mkldnn = True + def init_test_data(self): + self.x0 = np.random.random(self.x0_shape).astype(np.float32) + self.x1 = np.random.random(self.x1_shape).astype(np.float32) + self.x2 = np.random.random(self.x2_shape).astype(np.float32) + def init_axis(self): + self.axis = 0 -class TestMKLDNNConcatOp3(TestConcatOp3): - def setUp(self): - super(TestMKLDNNConcatOp3, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True + def init_shape(self): + self.x0_shape = [2, 2, 1, 50] + self.x1_shape = [1, 2, 1, 50] + self.x2_shape = [3, 2, 1, 50] - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) - def test_check_grad(self): - pass +class TestConcatAxis1OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 1 - def init_kernel_type(self): - self.use_mkldnn = True + def init_shape(self): + self.x0_shape = [1, 1, 5, 50] + self.x1_shape = [1, 2, 5, 50] + self.x2_shape = [1, 3, 5, 50] -class TestMKLDNNConcatOp4(TestConcatOp4): - def setUp(self): - super(TestMKLDNNConcatOp4, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True +class TestConcatAxis2OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 2 - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) + def init_shape(self): + self.x0_shape = [2, 3, 4, 50] + self.x1_shape = [2, 3, 5, 50] + self.x2_shape = [2, 3, 6, 50] - def test_check_grad(self): - pass - def init_kernel_type(self): - self.use_mkldnn = True +class TestConcatAxis3OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 3 + + def init_shape(self): + self.x0_shape = [5, 3, 5, 5] + self.x1_shape = [5, 3, 5, 6] + self.x2_shape = [5, 3, 5, 7] if __name__ == '__main__': - from paddle import enable_static enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 10cd774ce04be..5f936e577a06f 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard, core import paddle From e928834040fdb606fe56ba74769856b492cd9b79 Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Thu, 7 Oct 2021 11:43:43 +0200 Subject: [PATCH 20/80] [OneDNN] Conv op refactor. (#36252) * Remove unused header. * Use ConvMKLDNNHandlerT for conv2d INT8. * Use absolute module path to import. 
--- paddle/fluid/operators/mkldnn/axpy_handler.cc | 1 - .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 668 ++++++------------ paddle/fluid/platform/mkldnn_helper.h | 6 + paddle/fluid/platform/mkldnn_reuse.h | 568 +-------------- .../fluid/tests/unittests/test_conv2d_op.py | 3 +- .../unittests/test_conv2d_transpose_op.py | 2 +- 6 files changed, 251 insertions(+), 997 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index ed265edf003e0..db1127b055c31 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 1b69dd7ea00c7..c663ba2f88680 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,27 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_layout_transform.h" +#include + #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { - -using framework::DataLayout; -using mkldnn::memory; -using mkldnn::primitive; -using mkldnn::reorder; -using mkldnn::stream; -using platform::GetMKLDNNFormat; -using platform::to_void_cast; +namespace { inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, const int groups, @@ -78,7 +67,7 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights> { public: - ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx, + ConvMKLDNNHandlerT(const framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, @@ -92,19 +81,19 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( - input->layout(), DataLayout::kMKLDNN, + input->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); + framework::DataLayout::kMKLDNN, input->layout())); PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), 
MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Filter tensor")); @@ -137,10 +126,10 @@ class ConvMKLDNNHandlerT if (bias) { PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, + bias->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); + framework::DataLayout::kMKLDNN, bias->layout())); PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Bias tensor.")); @@ -188,12 +177,12 @@ class ConvMKLDNNHandlerT std::transform(dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; }); - const auto src_tz = paddle::framework::vectorize(input->dims()); + const auto src_tz = framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto weights_tz = framework::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - const auto dst_tz = paddle::framework::vectorize(output->dims()); + const auto dst_tz = framework::vectorize(output->dims()); const mkldnn::memory::dims stride_dims = strides; const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); @@ -204,29 +193,48 @@ class ConvMKLDNNHandlerT * the memory format preferred for best performance */ auto chosen_memory_format = MKLDNNMemoryFormat::any; - auto data_type = mkldnn::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || std::is_same::value) data_type = mkldnn::memory::data_type::bf16; - const auto src_md = - platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); - const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, - MKLDNNMemoryFormat::any); + mkldnn::memory::desc src_md, weights_md; + if (platform::is_int8()) { + src_md = platform::MKLDNNMemDesc( + src_tz, framework::ToMKLDNNDataType(input->type()), + chosen_memory_format); + weights_md = platform::MKLDNNMemDesc( + weights_tz, mkldnn::memory::data_type::s8, chosen_memory_format); + } else { + src_md = + platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, + MKLDNNMemoryFormat::any); + } + const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; + float sum_scale; + std::vector output_shift_scale; + std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + const mkldnn::primitive_attr conv_attr = CreatePostOps( - fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn); + fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, + output_shift_scale, sum_scale); // for INT8 only! 
if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = - platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc(bias_tz, data_type, + MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, @@ -255,28 +263,28 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isBwdCached()) { PADDLE_ENFORCE_EQ( - in->layout(), DataLayout::kMKLDNN, + in->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, in->layout())); + framework::DataLayout::kMKLDNN, in->layout())); PADDLE_ENFORCE_NE(in->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Input tensor.")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Filter tensor.")); PADDLE_ENFORCE_EQ( - out_grad->layout(), DataLayout::kMKLDNN, + out_grad->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The output_grad tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, out_grad->layout())); + framework::DataLayout::kMKLDNN, out_grad->layout())); PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for output_grad tensor")); @@ -296,28 +304,25 @@ class ConvMKLDNNHandlerT std::vector dilations(begin(dilations_temp), end(dilations_temp)); - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - int groups = ctx.Attr("groups"); - auto input_dims = in->dims(); auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); auto filter_dims = filter->dims(); auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + std::string padding_algorithm = + ctx.Attr("padding_algorithm"); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); auto src_tz = framework::vectorize(in->dims()); auto weights_tz = framework::vectorize(filter->dims()); + int groups = ctx.Attr("groups"); int g = std::max(groups, 1); platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(out_grad->dims()); + auto dst_tz = framework::vectorize(out_grad->dims()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose @@ -349,8 +354,14 @@ class ConvMKLDNNHandlerT mkldnn::primitive_attr conv_attr; if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc( + bias_tz, 
mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, mkldnn::prop_kind::forward_training, @@ -377,6 +388,71 @@ class ConvMKLDNNHandlerT } } + std::tuple> get_int8_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_in_data = ctx.Attr("Scale_in"); + const auto& scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + bool is_multi_channel = scale_weights_data.size() > 1; + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + return std::make_tuple(sum_scale, output_shift_scale); + } + + std::tuple> get_int8_bias_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const auto& scale_in_data = ctx.Attr("Scale_in"); + + bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (groups > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + + return std::make_tuple(mask_reorder, scale_bias_data); + } + mkldnn::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, @@ -433,7 +509,7 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->bwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_d_p", false); + platform::to_void_cast(filter_data), "@weights_mem_d_p", false); } std::shared_ptr AcquireSrcMemoryWithReorder( @@ -480,11 +556,11 @@ class ConvMKLDNNHandlerT framework::vectorize(in_mem->dims()), platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, to_void_cast(in_mem_data), key_mem); + user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); - user_mem_p->set_data_handle(to_void_cast(in_mem_data)); + user_mem_p->set_data_handle(platform::to_void_cast(in_mem_data)); if (user_mem_p != target_mem_p) { this->AcquireReorder(user_mem_p, target_mem_p, key_mem); } @@ -494,7 +570,8 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const framework::Tensor* filter, const int groups, const bool is_conv3d, - const bool is_test) { + const bool is_test, const std::vector& scale_data = {1.0f}, + int mask = 0) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); @@ -511,12 +588,14 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_p", is_test); + platform::to_void_cast(filter_data), "@weights_mem_p", is_test, {}, + scale_data, mask); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, const bool is_test) { + const framework::Tensor* bias, const bool is_test, + const std::vector& scale_data = {1.0f}, int mask = 0) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); if (is_test && bias_mem_p) { return bias_mem_p; @@ -527,8 +606,9 @@ class ConvMKLDNNHandlerT MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( - user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), - "@bias_mem_p", is_test); + user_bias_md, this->fwd_pd_->bias_desc(), + platform::to_void_cast(bias_data), "@bias_mem_p", is_test, {}, + scale_data, mask); } } @@ -536,8 +616,8 @@ class ConvMKLDNNHandlerT const framework::Tensor* residual_param) { void* residual_data = residual_param->type() == framework::DataTypeTrait::DataType() - ? to_void_cast(residual_param->data()) - : to_void_cast(residual_param->data()); + ? 
platform::to_void_cast(residual_param->data()) + : platform::to_void_cast(residual_param->data()); auto residual_mem_p = this->AcquireMemory("@user_residual_data_mem_p"); if (residual_mem_p) { residual_mem_p->set_data_handle(residual_data); @@ -572,12 +652,14 @@ class ConvMKLDNNHandlerT } }; +} // anonymous namespace + template -class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; @@ -607,9 +689,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } template - void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { + void ComputeFP32(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const bool is_test = ctx.Attr("is_test"); @@ -656,407 +738,112 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { conv_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } template - void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { - const bool is_test = ctx.Attr("is_test"); - + void ComputeINT8(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto* input = ctx.Input("Input"); - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor.")); - - PADDLE_ENFORCE_GE(input->dims().size(), 4, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); - PADDLE_ENFORCE_LE(input->dims().size(), 5, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. 
NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); + const std::string& fuse_activation = + ctx.Attr("fuse_activation"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool is_conv3d = ctx.Attr>("strides").size() == 3U; - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); bool unsigned_output = (fuse_activation == "relu" || fuse_activation == "relu6"); - - const T* input_data = input->data(); - - auto src_tz = paddle::framework::vectorize(input->dims()); - - mkldnn::memory::data_type src_dt = - paddle::framework::ToMKLDNNDataType(input->type()); - - std::string key = - platform::CreateKey(dev_ctx, src_tz, src_dt, - ctx.InputName("Input") + ctx.InputName("Filter")); - bool need_s8_to_u8 = false; - std::shared_ptr conv_p; - std::shared_ptr src_memory_p; - std::shared_ptr user_src_memory_p; - std::shared_ptr dst_memory_p; - std::vector pipeline; - std::shared_ptr conv_pd; - std::shared_ptr handler; - - // This is workaround for hacky implementation - // of conv int8 mkl-dnn. Once conv fp32 and conv int8 - // are merged/unified, this will disappear - auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_conv_pd = key_tid + "@conv_pd"; - auto prim_key = key_tid + "@conv_p"; - auto dst_key = key_tid + "@dst_mem_p"; - auto src_key = key_tid + "@src_mem_p"; - auto weights_key = key_tid + "@weights_mem_p"; - auto bias_key = key_tid + "@bias_mem_p"; - auto user_src_key = key_tid + "@user_src_mem_p"; - auto user_residual_key = key_tid + "@user_residual_data_mem_p"; - auto src_reorder_key = key_tid + "@src_mem_preorder_p"; - auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p"; - - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + PADDLE_ENFORCE_NE( + is_conv3d, true, + platform::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, false, + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); - if (conv_pd == nullptr || !is_test) { - float fuse_alpha = ctx.Attr("fuse_alpha"); - float fuse_beta = ctx.Attr("fuse_beta"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); - auto* filter = ctx.Input("Filter"); + ConvMKLDNNHandlerT handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias, + output, ctx.InputName("Input") + ctx.InputName("Filter")); - PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Filter tensor.")); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - PADDLE_ENFORCE_GE(filter->dims().size(), 4, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. 
OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); - PADDLE_ENFORCE_LE(filter->dims().size(), 5, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const bool is_multi_channel = scale_weights_data.size() > 1; + const int& groups = ctx.Attr("groups"); + const bool& is_test = ctx.Attr("is_test"); + int mask_reorder = + is_multi_channel ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, is_test, scale_weights_data, mask_reorder); + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( - !fuse_residual_conn || !force_fp32_output, true, - platform::errors::Unimplemented( - "residual fusion does not support force output with fp32")); - - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); - - PADDLE_ENFORCE_EQ(bias->dims().size(), 1, - platform::errors::InvalidArgument( - "Bias must only have 1 dimension, i.e. X, but " - "got dimension = %d .", - bias->dims().size())); - } - - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - std::vector dilations(begin(dilations_temp), - end(dilations_temp)); - - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - bool is_conv3d = strides.size() == 3U; - - PADDLE_ENFORCE_NE(is_conv3d, true, - platform::errors::Unimplemented( - "int8 does not support conv3d currently")); - - auto input_dims = input->dims(); - auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); - auto filter_dims = filter->dims(); - auto filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); - - auto ksize = framework::vectorize(filter_data_dims); - - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - data_dims, strides, ksize); - - int groups = ctx.Attr("groups"); - auto weights_tz = paddle::framework::vectorize(filter->dims()); - int g = std::max(groups, 1); - - platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(output->dims()); - - std::transform(dilations.begin(), dilations.end(), dilations.begin(), - [](int64_t i) { return i - 1; }); - - const K* filter_data = filter->data(); - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_out_data = - force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); - float sum_scale = - fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; - - bool is_multi_channel = scale_weights_data.size() > 1; - - int count = is_multi_channel ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] - : (weights_tz)[0]) - : 1; - std::vector output_shift_scale(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - if (scale_weights_data[i] == 0.0) - output_shift_scale[i] = - scale_out_data; // weights data will contain 0 - // in some models, then weights - // scale couldn't be calculated - else - output_shift_scale[i] = - static_cast(static_cast(scale_out_data) / - (static_cast(scale_in_data) * - static_cast(scale_weights_data[i]))); - } - - auto user_src_md = - platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - auto chosen_memory_format = MKLDNNMemoryFormat::any; - - std::vector bias_tz; - - auto src_md = - platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, chosen_memory_format); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - - handler.reset( - new platform::ConvMKLDNNHandler(dev_ctx, mkldnn_engine, key)); - // create a conv primitive descriptor and save it for usage in backward - auto propagation = is_test ? mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training; - - if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, - MKLDNNMemoryFormat::x); - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, dilations, paddings, - mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } else { - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, paddle::none, dst_md, strides, dilations, - paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } - - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - int mask_reorder = - is_multi_channel ? ((g != 1) ? 
(1 << 1) + (1 << 0) : 1 << 0) : 0; - weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, true, scale_weights_data, - mask_reorder); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - PADDLE_ENFORCE_EQ( - output->dims(), residual_param->dims(), - platform::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - output->dims().size(), residual_param->dims().size())); - auto residual_dt = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - if (residual_param->format() != handler->GetDstFormat()) { - auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_dt, residual_param->format()); - dst_memory_p = platform::SetDstMemory( - ctx, output, residual_param, user_residual_md, handler, - &pipeline); - } else { - output->ShareDataWith(*residual_param); - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } else { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - - // create convolution op primitive - conv_p = handler->AcquireConvolution(); - if (bias) { - const K* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - auto user_bias_memory_p = handler->AcquireBiasMemory( - user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - int mask_reorder = is_multi_channel ? 1 << 0 : 1; - int count = - is_multi_channel - ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) - : 1; - std::vector scale_bias_data(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - scale_bias_data[i] = scale_in_data * scale_weights_data[i]; - } - bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( - user_bias_memory_p, pipeline, is_test, true, scale_bias_data, - mask_reorder); - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } - } else { - auto src_memory_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(src_reorder_key)); - src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(src_key)); - if (src_memory_reorder_p) { - user_src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_src_key)); - user_src_memory_p->set_data_handle(to_void_cast(input_data)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - src_memory_reorder_p->execute(astream, *user_src_memory_p, - *src_memory_p); - astream.wait(); - } - } else if (src_memory_p) { - src_memory_p->set_data_handle(to_void_cast(input_data)); - } - auto weights_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(weights_key)); + output->dims(), residual_param->dims(), + platform::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), residual_param->dims().size())); dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - output->ShareDataWith(*residual_param); - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } - platform::SetDstMemoryHandler(ctx, output, handler, dst_memory_p); + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (platform::MKLDNNGetDataType() == + mkldnn::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } - auto residual_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(residual_reorder_key)); - if (residual_reorder_p) { - auto user_residual_data_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_residual_key)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - residual_reorder_p->execute(astream, *user_residual_data_p, - *dst_memory_p); - astream.wait(); - } - } + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}; - auto bias_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(bias_key)); + if (bias) { + float mask_reorder; + std::vector scale_bias_data; + std::tie(mask_reorder, scale_bias_data) = + handler.get_int8_bias_scales(ctx); - if (bias_memory_p) { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - 
conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, is_test, scale_bias_data, mask_reorder); + args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + conv_p->execute(astream, args); astream.wait(); + if (need_s8_to_u8) { output->mutable_data(ctx.GetPlace()); } - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } }; template -class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNGradOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL ConvGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); @@ -1105,18 +892,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); astream.wait(); - filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_layout(framework::DataLayout::kMKLDNN); // in OneDNN groups in convolution are treated as separate dimension // which is not the case in paddlepaddle - auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); + auto filter_fmt = platform::GetMKLDNNFormat(*diff_weights_memory_p); // For convolution with groups convert from blocked to NCHW // otherwise there will be problems in next operators working on this data if (g > 1) { - memory::data_type in_type = framework::ToMKLDNNDataType(filter->type()); + mkldnn::memory::data_type in_type = + framework::ToMKLDNNDataType(filter->type()); // for 3d conv with groups (six dimensional data reorder to goidhw) // for 2d conv with groups (five dimensional data reorder to goihw) - // auto weights_tz = paddle::framework::vectorize(filter->dims()); + // auto weights_tz = framework::vectorize(filter->dims()); auto weights_tz = diff_weights_memory_p->get_desc().dims(); mkldnn::memory::format_tag out_format = @@ -1168,8 +956,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + input_grad->set_layout(framework::DataLayout::kMKLDNN); + input_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } } }; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f14f92cb51fdb..37fa58e423db7 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -531,7 +531,13 @@ inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) { inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) { return op->GetAttrIfExists("mkldnn_data_type") == "float32"; } + enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; +template +bool constexpr is_int8() { + return std::is_same::value || std::is_same::value; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h 
b/paddle/fluid/platform/mkldnn_reuse.h index 1aa8c0cdb57f9..084b47bb3c7a3 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -527,7 +527,8 @@ class MKLDNNHandlerT { const mkldnn::memory::desc& user_md, const mkldnn::memory::desc& target_md, void* ptr, const std::string& suffix, bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { + std::function(const F*)> custom_reorder_func = {}, + const std::vector& scale_data = {1.0f}, int mask = 0) { const auto target_key = key_ + suffix + "_target"; const auto key_reorder_p = key_ + suffix + "reorder_p"; const auto user_key = key_ + suffix + "_user"; @@ -546,8 +547,17 @@ class MKLDNNHandlerT { std::make_shared(user_md, engine_, ptr); if (user_md != target_md) { target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + dnnl::reorder::primitive_desc reorder_pdesc; + if (is_int8()) { + dnnl::primitive_attr attr; + attr.set_output_scales(mask, scale_data); + reorder_pdesc = dnnl::reorder::primitive_desc(*user_memory_p, + *target_memory_p, attr); + } else { + reorder_pdesc = + dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); + } + auto reorder_p = std::make_shared(reorder_pdesc); dev_ctx_.SetBlob(key_reorder_p, reorder_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -597,201 +607,6 @@ class MKLDNNHandlerT { std::shared_ptr bwd_w_pd_; }; -// TODO(grygielski) this class will be deleted later. -class MKLDNNHandler { - public: - MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : dev_ctx_(dev_ctx), - engine_(engine), - key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_src_mem_p"); - } - - std::shared_ptr AcquireDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr, const std::string& suffix) { - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, const std::string& suffix) { - const auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_); - dev_ctx_.SetBlob(local_key, mem_p); - } - return mem_p; - } - - // This incarnation of AcquireMemory can call user function eg. 
custom reorder - // or preprocessing routine if needed - std::shared_ptr AcquireMemory( - const mkldnn::memory::desc& md, void* ptr, const std::string& suffix, - user_function custom_func = {}) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - // Call custom reorder/preprocessing func if available - if (custom_func) { - auto reordered_data = custom_func(reinterpret_cast(ptr)); - dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data); - ptr = reinterpret_cast(reordered_data.get()); - } - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::vector& dims, const mkldnn::memory::data_type dtype, - const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto md = mkldnn::memory::desc(dims, dtype, fmt); - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix, - std::vector& pipeline) { // NOLINT - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto stored_reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (stored_reorder_p) { - pipeline.push_back(*stored_reorder_p); - } else { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - return target_memory_p; - } - - std::shared_ptr AcquireMemory( - mkldnn::memory::desc& md, // NOLINT - mkldnn::memory::desc& user_md, // NOLINT - const std::shared_ptr user_memory_p, - const std::string& suffix, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - // create reorder primitive if the input format is not the preferred one - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (target_memory_p == nullptr) { - target_memory_p = user_memory_p; - if (md != user_md) { - target_memory_p = std::make_shared(md, engine_); - std::shared_ptr reorder_pd; - if (is_INT8) { - mkldnn::primitive_attr - attri; // attribute for int8 weights and bias data reorder. 
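-        // In set_output_scales(mask, scale_data) a mask of 0 applies one
-        // common scale to the whole reorder, while setting bit i applies a
-        // separate scale per element of dimension i of the destination
-        // memory (1 << 0 gives per-output-channel weight/bias scales,
-        // (1 << 1) + (1 << 0) covers the group and output-channel
-        // dimensions of grouped weights).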
- attri.set_output_scales(mask, scale_data); - - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p, attri)); - } else { - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p)); - } - auto reorder_p = - std::shared_ptr(new mkldnn::reorder(*reorder_pd)); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - dev_ctx_.SetBlob(local_key, target_memory_p); - } else if (!is_persistent) { - // Make reorder if needed - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - } - return target_memory_p; - } - - protected: - const MKLDNNDeviceContext& dev_ctx_; - mkldnn::engine engine_; - std::string key_; -}; - template class BinaryMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { @@ -1143,362 +958,6 @@ class ReorderMKLDNNHandler { mkldnn::engine engine_; }; -template -struct convolutional_algorithm; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct; -}; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = - mkldnn::algorithm::deconvolution_direct; -}; - -template -class ConvMKLDNNTemplateHandler : public MKLDNNHandler { - public: - ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - - // TODO(jczaja): remove after conv int8 is adapted - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - conv_pd_ = conv_pd; - } - - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - std::shared_ptr - conv_bwd_data_pd, - std::shared_ptr - conv_bwd_weights_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - conv_pd_(conv_pd), - conv_bwd_weights_pd_(conv_bwd_weights_pd), - conv_bwd_data_pd_(conv_bwd_data_pd) { - // If we are in Grad operatgor then update a key with BWD suffix to - // distinguish from FWD memory primitives - key_ += "-BWD"; - } - - size_t GetDstMemorySize() const { return conv_pd_->dst_desc().get_size(); } - - MKLDNNMemoryFormat GetDstFormat() const { - return paddle::platform::GetMKLDNNFormat(conv_pd_->dst_desc()); - } - - size_t GetDiffWeightsMemorySize() const { - return conv_bwd_weights_pd_->diff_weights_desc().get_size(); - } - - size_t GetDiffSourceMemorySize() const { - return conv_bwd_data_pd_->diff_src_desc().get_size(); - } - - std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_bwd_weights_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, - "@weights-src_mem_p", pipeline); - } - - std::shared_ptr 
AcquireDiffDstMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@weights-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), ptr, "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@data-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT - auto weights_pd = conv_bwd_data_pd_->weights_desc(); - auto user_pd = user_weights_memory_p->get_desc(); - return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, - "@data-weights_mem_p", pipeline); - } - - std::shared_ptr AcquireResidualDataMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromResidualDataMemory( - const std::shared_ptr& user_residual_memory_p, - void* dst_ptr, - std::vector& pipeline) { // NOLINT - return this->AcquireMemory(user_residual_memory_p, - this->AcquireDstMemoryFromPrimitive(dst_ptr), - "@residual_data_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_bwd_data_pd_->diff_src_desc(), - ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_pd_->dst_desc(), ptr, - "@dst_mem_p"); - } - - std::shared_ptr AcquireSrcMemoryFromPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", - pipeline); - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::desc& md, void* ptr, - user_function custom_func = {}) { - return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func); - } - - std::shared_ptr AcquireBiasMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); - } - - std::shared_ptr AcquireWeightsMemoryFromPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - auto user_weights_pd = user_weights_memory_p->get_desc(); - auto weights_pd = conv_pd_->weights_desc(); - return this->AcquireMemory( - weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent, is_INT8, scale_data, mask); - } - - std::shared_ptr AcquireBiasMemoryFromPrimitive( - const std::shared_ptr user_bias_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = 
false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, - int mask = 0) { // NOLINT - auto user_bias_pd = user_bias_memory_p->get_desc(); - auto bias_pd = conv_pd_->bias_desc(); - return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline, is_persistent, is_INT8, - scale_data, mask); - } - - mkldnn::primitive_attr CreatePostOps( - std::string fuse_activation, float fuse_alpha, float fuse_beta, - bool fuse_residual_conn, const std::vector output_shift_scale = {}, - float sum_scale = 1.0f) const { - mkldnn::primitive_attr conv_attr; - mkldnn::post_ops post_operations; - if (output_shift_scale.size() > 0) { - int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; - conv_attr.set_output_scales(mask, output_shift_scale); - } - // Fusion with Elementwise layer relies on adding a sum post-operation with - // the scale parameter. It is assumed that when fuse_residual_connection is - // true, the output tensor contains the data coming from residual - // connection. The result of this post_op is: - // Output = scale * Output + Conv_Out. - if (fuse_residual_conn) { - post_operations.append_sum(sum_scale); - } - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. - if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "relu6") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, - mkldnn::algorithm::eltwise_bounded_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "swish") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_swish, - fuse_alpha, fuse_beta); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - - std::shared_ptr - AcquireConvolutionPrimitiveDescriptor( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, - paddle::optional bias, - const mkldnn::memory::desc& dst, const std::vector& strides, - const std::vector& dilations, - const std::vector& paddings, const mkldnn::engine& engine, - const std::string& fuse_activation, float fuse_alpha, float fuse_beta, - const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind, - const std::vector output_shift_scale = {}, - const float sum_scale = 1.0f) { - // Conv PD has to be passed to Grad op that - // may be exxecuted by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_conv_pd = key_ + "@conv_pd"; - - conv_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_conv_pd)); - - if (conv_pd_ == nullptr) { - mkldnn::memory::dims stride_dims = strides; - mkldnn::memory::dims dilations_dims = dilations; - auto mkldnn_paddings = ToMkldnnPadding(paddings); - - auto conv_desc = - bias ? 
typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, *bias, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, output_shift_scale, sum_scale); - - conv_pd_.reset( - new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx_.SetBlob(key_conv_pd, conv_pd_); - } - - return conv_pd_; - } - - std::shared_ptr AcquireConvolution() { - auto prim_key = key_ + "@conv_p"; - auto conv_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_); - - dev_ctx_.SetBlob(prim_key, conv_p); - } - return conv_p; - } - - std::shared_ptr AcquireConvolutionBackwardWeights() { - auto prim_key = key_ + "@conv_bwd_weights_p"; - auto conv_bwd_weights_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_weights_p == nullptr) { - // create backward conv primitive for weights - conv_bwd_weights_p = - std::make_shared(*conv_bwd_weights_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); - } - return conv_bwd_weights_p; - } - - std::shared_ptr AcquireConvolutionBackwardData() { - auto prim_key = key_ + "@conv_bwd_data_p"; - auto conv_bwd_data_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_data_p == nullptr) { - conv_bwd_data_p = std::make_shared(*conv_bwd_data_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); - } - return conv_bwd_data_p; - } - - private: - std::shared_ptr conv_pd_; - std::shared_ptr - conv_bwd_weights_pd_; - std::shared_ptr conv_bwd_data_pd_; -}; - -using ConvMKLDNNHandler = - ConvMKLDNNTemplateHandler; - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - return dst_memory_p; -} - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const framework::Tensor* residual_param, - const mkldnn::memory::desc& user_residual_md, - const std::shared_ptr& handler, - std::vector* pipeline) { - const T* residual_param_data = residual_param->data(); - PADDLE_ENFORCE_NOT_NULL( - residual_param_data, - platform::errors::PreconditionNotMet("Residual parameter is required for " - "the DNNL conv+elementwise_add " - "fusion, but now it is missing.")); - std::shared_ptr user_residual_memory_p = - handler->AcquireResidualDataMemory(user_residual_md, - to_void_cast(residual_param_data)); - T* output_data = output->mutable_data(ctx.GetPlace()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), *pipeline); - return dst_memory_p; -} - -template -static void SetDstMemoryHandler( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler, - std::shared_ptr dst_memory_p) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - 
dst_memory_p->set_data_handle(to_void_cast(output_data)); -} - template static void SetDstMemoryQuantized( const framework::ExecutionContext& ctx, framework::Tensor* output, @@ -1524,5 +983,6 @@ static void SetDstMemoryQuantized( dst_memory.reset( new mkldnn::memory(*dst_md, engine, to_void_cast(output_data))); } + } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index db05801c7227b..8ea4e369d3236 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -20,7 +20,8 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.op_test import ( + OpTest, convert_float_to_uint16, get_numeric_gradient) from paddle.fluid.tests.unittests.testsuite import create_op from paddle.fluid import Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 027c806fc02e9..89125dc326d15 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -22,7 +22,7 @@ paddle.enable_static() import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest def conv2dtranspose_forward_naive(input_, filter_, attrs): From 730dcaf48f6b1e0e561860eb503ceef9a9498b59 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Thu, 7 Oct 2021 22:06:21 +0800 Subject: [PATCH 21/80] fix bugs in HybridParallelClipGrad of hybrid_parallel_optimizer (#36237) * fix bugs in HybridParallelClipGrad of hybrid_parallel_optimizer * update * update --- .../hybrid_parallel_optimizer.py | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 581fbc5153ad4..b00ef2cdcb0e1 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -50,7 +50,8 @@ def __init__(self, clip, hcg): @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] - sum_square_list = [] + sum_square_list_dist = [] + sum_square_list_not_dist = [] for p, g in params_grads: if g is None: continue @@ -62,18 +63,33 @@ def _dygraph_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows(merge_grad) square = layers.square(merge_grad) sum_square = layers.reduce_sum(square) - sum_square_list.append(sum_square) + + if p.is_distributed: + sum_square_list_dist.append(sum_square) + else: + sum_square_list_not_dist.append(sum_square) # all parameters have been filterd out - if len(sum_square_list) == 0: + if len(sum_square_list_dist) + len(sum_square_list_not_dist) == 0: return params_grads - global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - # add all reduce to get global norm in world size - paddle.distributed.all_reduce(global_norm_var, - self._hcg.get_check_parallel_group()) - 
global_norm_var = layers.sqrt(global_norm_var) + global_norm_var_dist = layers.concat(sum_square_list_dist) if len( + sum_square_list_dist) != 0 else layers.concat( + [paddle.to_tensor([0.])]) + global_norm_var_dist = layers.reduce_sum(global_norm_var_dist) + global_norm_var_not_dist = layers.concat( + sum_square_list_not_dist) if len( + sum_square_list_not_dist) != 0 else layers.concat( + [paddle.to_tensor([0.])]) + global_norm_var_not_dist = layers.reduce_sum(global_norm_var_not_dist) + + # add all reduce to get global norm of distributed params_and_grads in world size + # all reduce is not needed while getting global norm of non-distributed params_and_grads + paddle.distributed.all_reduce( + global_norm_var_dist, group=self._hcg.get_check_parallel_group()) + + global_norm_var = layers.sqrt(global_norm_var_dist + + global_norm_var_not_dist) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) @@ -96,7 +112,7 @@ def __getattr__(self, item): return getattr(self._clip, item) def __call__(self, params_grads): - return self._clip(params_grads) + return self._dygraph_clip(params_grads) class HybridParallelOptimizer: @@ -112,7 +128,7 @@ def __init__(self, optimizer, hcg, strategy): self._need_dp = (self._hcg.get_data_parallel_world_size() > 1) # NOTE(shenliang03): Because of the pure DataParallel mode, the gradient synchronization - # is achieved through reducer, so there is no need to call fuse_allreduce in oprimizer. + # is achieved through reducer, so there is no need to call fuse_allreduce in optimizer. self._dp_enable = not self._use_dp_mode and self._need_dp self._sharding_enable = ( From 9814f89551e2133c6733352f6445d4d668da6f63 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 8 Oct 2021 10:47:13 +0800 Subject: [PATCH 22/80] fix cast cuda implementation (#36266) --- paddle/fluid/operators/cast_op.cu | 64 ++++++++++++++++--------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 06300817e0a12..601735c2f148a 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -47,12 +47,12 @@ __global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) { } template -struct CastOpFunctor { +struct CastCUDAOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; const platform::CUDADeviceContext& ctx_; - CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::CUDADeviceContext& ctx) + CastCUDAOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const platform::CUDADeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -75,6 +75,21 @@ struct CastOpFunctor { } }; +template +class CastCUDAOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast( + context.Attr("out_dtype")), + CastCUDAOpFunctor( + in, out, + context.template device_context())); + } +}; + } // namespace operators } // namespace paddle @@ -82,34 +97,21 @@ namespace ops = paddle::operators; #ifdef PADDLE_WITH_HIP REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel>, - ops::CastOpKernel>); + cast, 
ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel>, + ops::CastCUDAOpKernel>); #else REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel>, - ops::CastOpKernel>); + cast, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel, + ops::CastCUDAOpKernel>, + ops::CastCUDAOpKernel>); #endif From 1bd9cfef4e27baa84fd40ed1e65e80017d0cf232 Mon Sep 17 00:00:00 2001 From: arlesniak Date: Fri, 8 Oct 2021 05:33:09 +0200 Subject: [PATCH 23/80] Added oneDNN BF16 relu (#36265) * Added oneDNN BF16 relu * fixed typo * refactored test, review fixes --- .../operators/mkldnn/activation_mkldnn_op.cc | 3 +- .../mkldnn/test_activation_bf16_mkldnn_op.py | 44 ++++++++++++++++--- 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index d992890adeec3..603a70458b0ce 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -257,7 +257,6 @@ namespace ops = paddle::operators; ops::grad_functor>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ @@ -267,6 +266,8 @@ namespace ops = paddle::operators; __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, + ReluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py index 3d5a013915833..cd9987b3c8e82 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py @@ -14,6 +14,8 @@ from __future__ import print_function +import six +import abc import unittest import numpy as np from scipy.special import expit, erf @@ -24,15 +26,19 @@ @OpTestTool.skip_if_not_cpu_bf16() -class TestMKLDNNSigmoidBF16Op(TestActivation): +@six.add_metaclass(abc.ABCMeta) +class MKLDNNBF16ActivationOp(object): + @abc.abstractmethod def config(self): - self.op_type = "sigmoid" + pass + @abc.abstractmethod def op_forward(self, x): - return 1 / (1 + np.exp(-x)) + pass + @abc.abstractmethod def op_grad(self, dout, x): - return dout * self.op_forward(x) * (1 - self.op_forward(x)) + pass def set_attrs(self): self.attrs = {"use_mkldnn": True} @@ -65,7 +71,18 @@ def test_check_grad(self): user_defined_grad_outputs=[convert_float_to_uint16(self.out)]) -class TestMKLDNNGeluErfBF16Op(TestMKLDNNSigmoidBF16Op): +class 
TestMKLDNNSigmoidBF16Op(MKLDNNBF16ActivationOp, TestActivation): + def config(self): + self.op_type = "sigmoid" + + def op_forward(self, x): + return 1 / (1 + np.exp(-x)) + + def op_grad(self, dout, x): + return dout * self.op_forward(x) * (1 - self.op_forward(x)) + + +class TestMKLDNNGeluErfBF16Op(MKLDNNBF16ActivationOp, TestActivation): def config(self): self.op_type = "gelu" @@ -83,7 +100,7 @@ def init_data(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) -class TestMKLDNNGeluTanhBF16Op(TestMKLDNNSigmoidBF16Op): +class TestMKLDNNGeluTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation): def config(self): self.op_type = "gelu" @@ -104,3 +121,18 @@ def set_attrs(self): class TestMKLDNNGeluTanhDim2BF16Op(TestMKLDNNGeluTanhBF16Op): def init_data(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + + +class TestMKLDNNReluBF16Op(MKLDNNBF16ActivationOp, TestActivation): + def config(self): + self.op_type = "relu" + + def op_forward(self, x): + return np.maximum(x, 0) + + def op_grad(self, dout, x): + return dout + + +if __name__ == '__main__': + unittest.main() From a29ff4c77a658f1265b56b3cb9b3a7ad7f296f73 Mon Sep 17 00:00:00 2001 From: huangxu96 <46740794+huangxu96@users.noreply.github.com> Date: Fri, 8 Oct 2021 16:19:16 +0800 Subject: [PATCH 24/80] add python interface of sub_graph (#36120) Add python interface of subgraph: 1. all_sub_graphs() 2. get_sub_graph(idx) --- paddle/fluid/pybind/ir.cc | 10 +- python/paddle/fluid/framework.py | 26 ++++- .../ir/test_ir_subgraph_python_interface.py | 96 +++++++++++++++++++ 3 files changed, 128 insertions(+), 4 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index e27e3674eeeb5..050bfc967daa1 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -125,7 +125,15 @@ void BindGraph(py::module *m) { return_value_policy::reference) .def("resolve_hazard", &Graph::ResolveHazard) .def("origin_program_desc", &Graph::OriginProgram, - return_value_policy::reference); + return_value_policy::reference) + .def("sub_graph_size", &Graph::SubGraphsSize) + .def("get_sub_graph", [](Graph &self, int i) { + /* Here we use a lambda function as an empty deleter to avoid the double + free of smart pointer. + Otherwise, this shared pointer will be free both in python and + cpp scope, which will lead a core dumped. */ + return std::shared_ptr(self.GetSubGraph(i), [](Graph *) {}); + }); } void BindNode(py::module *m) { diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b6241f6e5299d..7f2937b9af764 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3956,6 +3956,23 @@ def all_op_nodes(self): """ return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} + def all_sub_graphs(self, for_test=False): + """ + Return all sub_graphs included in the main graph as a set. + """ + + return [ + IrGraph( + self.graph.get_sub_graph(i), for_test=for_test) + for i in range(self.graph.sub_graph_size()) + ] + + def get_sub_graph(self, i, for_test=False): + """ + Return i-th sub_graph in the main graph. + """ + return IrGraph(self.graph.get_sub_graph(i), for_test=for_test) + def create_persistable_node(self, name, var_type, shape, var_dtype): """ Create a persistable variable node in the graph. In IrGraph, @@ -4102,8 +4119,10 @@ def link_to(self, node_in, node_out): node_in(IrNode): the input node. node_out(IrNode): the output node. 
""" - assert node_in.node in self.graph.nodes() and node_out.node in self.graph.nodes(), \ - 'The two arguments(node_in&node_out) must be in the graph nodes.' + assert node_in.node in self.graph.nodes(), ( + 'node_in(%s) must be in the graph nodes.' % node_in.node.name()) + assert node_out.node in self.graph.nodes(), ( + 'node_out(%s) must be in the graph nodes.' % node_out.node.name()) node_in.append_output(node_out) node_out.append_input(node_in) @@ -4265,7 +4284,8 @@ def _find_node_by_name(self, nodes, node_name): for n in nodes: if n.name() == node_name: target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." + assert target_node is not None, ( + "Cannot find the target node (%s)in the giving set." % node_name) return target_node def _update_desc_attr(self, desc, name, val): diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py new file mode 100644 index 0000000000000..49ca89a35f4ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import paddle.fluid as fluid +import six + +from paddle.fluid.framework import IrGraph +from paddle.fluid.framework import IrNode +from paddle.fluid.tests.unittests.op_test import OpTestTool +from paddle.fluid import core +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard, default_startup_program +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass + +paddle.enable_static() + + +class TestQuantizationSubGraph(unittest.TestCase): + def build_graph_with_sub_graph(self): + def linear_fc(num): + data = fluid.layers.data( + name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + main_program = Program() + startup_program = Program() + + def true_func(): + return linear_fc(3) + + def false_func(): + return linear_fc(5) + + with program_guard(main_program, startup_program): + x = layers.fill_constant(shape=[1], dtype='float32', value=0.1) + y = layers.fill_constant(shape=[1], dtype='float32', value=0.23) + pred = layers.less_than(y, x) + out = layers.cond(pred, true_func, false_func) + + core_graph = core.Graph(main_program.desc) + # We should create graph for test, otherwise it will throw a + # error that it cannot find the node of "STEP_COUNTER" + graph = IrGraph(core_graph, for_test=True) + sub_graph = graph.get_sub_graph(0) + all_sub_graphs = graph.all_sub_graphs( + for_test=True) # same reason for subgraph + # Should return graph and sub_graphs at the same time. If only return sub_graph, the graph will + # be destructed and the sub_graphs will be empty. 
+ return graph, all_sub_graphs + + def test_quant_sub_graphs(self, use_cuda=False): + graph, sub_graphs = self.build_graph_with_sub_graph() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + place=place, + activation_quantize_type='abs_max', + weight_quantize_type='range_abs_max') + Find_inserted_quant_op = False + for sub_graph in sub_graphs: + transform_pass.apply(sub_graph) + for op in sub_graph.all_op_nodes(): + if 'quantize' in op.name(): + Find_inserted_quant_op = True + self.assertTrue(Find_inserted_quant_op) + + def test_quant_sub_graphs_cpu(self): + self.test_quant_sub_graphs(use_cuda=False) + + @OpTestTool.skip_if(not paddle.is_compiled_with_cuda(), + "Not GPU version paddle") + def test_quant_sub_graphs_gpu(self): + self.test_quant_sub_graphs(use_cuda=True) + + +if __name__ == '__main__': + unittest.main() From 7cb19f575f8ff7e8f4d03fd70a5fc33c76360a36 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 8 Oct 2021 16:44:01 +0800 Subject: [PATCH 25/80] [NPU] BatchNorm support layout of NCL and NLC, test=develop (#35668) * [NPU] support NCL and NCL for BatchNorm, test=develop * [NPU] remove debug files, test=develop * update, test=develop --- paddle/fluid/operators/batch_norm_op_npu.cc | 62 ++++++++++++++----- paddle/fluid/operators/conv_op_npu.cc | 5 -- .../unittests/npu/test_batch_norm_op_npu.py | 54 +++++++++++++++- .../tests/unittests/test_batch_norm_op.py | 37 ++++++++++- 4 files changed, 133 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index dfb620a4e96bd..791c3656791da 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -38,11 +38,13 @@ class NPUBatchNormOpKernel : public framework::OpKernel { const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ( + (x_dims.size() == 4UL || x_dims.size() == 3UL), true, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 3 or 4. 
" + " But got X's shape = [%s], X's dimension = [%d].", + x_dims.to_str(), x_dims.size())); + const auto *running_mean = ctx.Input("Mean"); const auto *running_var = ctx.Input("Variance"); const auto *scale = ctx.Input("Scale"); @@ -51,8 +53,11 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - Tensor x_tensor(x->type()); - Tensor y_tesnor(y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto y_tesnor = + ctx.AllocateTmpTensor(y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); y_tesnor.ShareDataWith(*y); if (data_layout == DataLayout::kNHWC) { @@ -89,6 +94,18 @@ class NPUBatchNormOpKernel : public framework::OpKernel { sum.mutable_data(running_mean->dims(), ctx.GetPlace()); square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); + // BNTrainingReduce ONLY support rank = 4 + if (x->dims().size() == 3) { + auto x_shape_vec = framework::vectorize(x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + x_tensor.Resize(x_new_shape); + x_tensor.Resize(x_new_shape); + } const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", {x_tensor}, {sum, square_sum}, {{"epsilon", epsilon}}); @@ -127,8 +144,11 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { use_global_stats = is_test || use_global_stats; - Tensor x_tensor(x->type()); - Tensor dy_tensor(d_y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto dy_tensor = + ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); dy_tensor.ShareDataWith(*d_y); if (data_layout == DataLayout::kNHWC) { @@ -136,14 +156,14 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { dy_tensor.set_layout(DataLayout::kNHWC); } - Tensor scale_grad_tmp(scale->type()); - Tensor bias_grad_tmp(bias->type()); + auto scale_grad_tmp = + ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto bias_grad_tmp = + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_scale == nullptr) { - scale_grad_tmp.Resize(scale->dims()); d_scale = &scale_grad_tmp; } if (d_bias == nullptr) { - bias_grad_tmp.Resize(bias->dims()); d_bias = &bias_grad_tmp; } @@ -169,9 +189,23 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { } if (d_x) { d_x->mutable_data(ctx.GetPlace()); - Tensor dx_tensor(d_x->type()); + auto dx_tensor = + ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); dx_tensor.ShareDataWith(*d_x); if (use_global_stats) { + if (x->dims().size() == 3) { + // BNInferGrad only support x rank = 4, + auto x_shape_vec = framework::vectorize(d_x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, + 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + dx_tensor.Resize(x_new_shape); + dy_tensor.Resize(x_new_shape); + } const auto *running_var = ctx.Input("Variance"); const auto &runner_infer = NpuOpRunner("BNInferGrad", {dy_tensor, *scale, *running_var}, diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 86724e06975ed..47de843d1ac6f 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -186,11 +186,6 @@ class 
DepthwiseConvGradNPUKernel : public framework::OpKernel { dilations[3] = dilation[1]; } - // LOG(INFO) << "strides = " << framework::make_ddim(strides).to_str(); - // LOG(INFO) << "dilations = " << framework::make_ddim(dilations).to_str(); - // LOG(INFO) << "padding = " << framework::make_ddim(padding).to_str(); - // LOG(INFO) << "data_format = " << data_format; - if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 1b8b13a0d27ea..877f9904f3407 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -45,6 +45,14 @@ def check_with_place(self, place, data_layout, dtype, shape): if len(shape) == 2: x_shape = shape c = x_shape[1] + if len(shape) == 3: + n, l, c = shape[0], shape[1], shape[2] + if data_layout == "NHWC": # NLC + x_shape = [n, l, c] + elif data_layout == "NCHW": # NCL + x_shape = [n, c, l] + else: + raise ValueError("Unknown data layout.") else: n, h, w, c = shape[0], shape[1], shape[2], shape[3] if data_layout == "NHWC": @@ -117,6 +125,7 @@ def test_check_output(self): place = core.NPUPlace(0) for data_format in self.data_formats: self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + self.check_with_place(place, data_format, self.dtype, [3, 8, 5]) def init_kernel_type(self): pass @@ -185,10 +194,19 @@ def test_with_place(place, data_layout, shape): # attr epsilon = self.epsilon momentum = self.momentum - if data_layout == "NCHW": - n, c, h, w = shape[0], shape[1], shape[2], shape[3] + + if len(shape) == 3: + if data_layout == "NHWC": # NLC + n, l, c = shape[0], shape[1], shape[2] + elif data_layout == "NCHW": # NCL + n, c, l = shape[0], shape[1], shape[2] + else: + raise ValueError("Unknown data layout.") else: - n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_layout == "NCHW": + n, c, h, w = shape[0], shape[1], shape[2], shape[3] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] scale_shape = [c] np.random.seed(123) @@ -296,6 +314,7 @@ def test_with_place(place, data_layout, shape): for data_format in self.data_formats: test_with_place(core.NPUPlace(0), data_format, [2, 3, 4, 5]) + test_with_place(core.NPUPlace(0), data_format, [3, 8, 5]) def init_kernel_type(self): pass @@ -328,6 +347,17 @@ def init_test_case(self): ] def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) y_grad = np.transpose(y_grad, (0, 2, 3, 1)) @@ -343,6 +373,9 @@ def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): x = np.transpose(x, (0, 3, 1, 2)) y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + return x_grad, grad_scale, grad_offset def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, @@ -350,6 +383,17 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, if data_layout != "NCHW" and data_layout != "NHWC": raise ValueError("Unknown data 
order.") + x_shape = x.shape + if len(x_shape) == 3: + if data_layout == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_layout == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) @@ -369,6 +413,10 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, x_grad, scale_grad, bias_grad = self.reference_grad( x, y_grad, scale, mean, variance, epsilon, data_layout) + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + x_grad = np.reshape(x_grad, x_shape) + return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 9eaa69ce64428..cce13a8bf3b74 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -36,6 +36,11 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) else: x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) if data_format == "NCHW": n, c, h, w = x.shape @@ -55,13 +60,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): else: raise ValueError("Unknown data order.") - if len(x_shape) == 2: + if len(x_shape) == 2 or len(x_shape) == 3: y = np.reshape(y, x_shape) return y def _cal_mean_variance(x, epsilon, data_format): assert data_format in ['NCHW', 'NHWC'] + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) x_square = x * x axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2) C = x.shape[1] if data_format == 'NCHW' else x.shape[-1] @@ -76,6 +87,12 @@ def _cal_mean_variance(x, epsilon, data_format): def _reference_training(x, scale, offset, epsilon, data_format): x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": n, c, h, w = x.shape x_square = x * x @@ -94,7 +111,6 @@ def _reference_training(x, scale, offset, epsilon, data_format): offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) y = normalized * scale_tile + offset_tile - return y, mean, var elif data_format == "NHWC": x_square = x * x x_square_sum = np.sum(x_square, (0, 1, 2)) @@ -104,10 +120,13 @@ def _reference_training(x, scale, offset, epsilon, data_format): var = x_square_sum / element_count - mean * mean normalized = (x - mean) / np.sqrt(var + epsilon) y = normalized * scale + offset - return y, mean, var else: raise ValueError("Unknown data order.") + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + return y, mean, var + def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): # Use the following formulas to calculate gradients: @@ -124,6 +143,15 @@ def 
_reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): if data_format != "NCHW" and data_format != "NHWC": raise ValueError("Unknown data order.") + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) y_grad = np.transpose(y_grad, (0, 2, 3, 1)) @@ -142,6 +170,9 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): x = np.transpose(x, (0, 3, 1, 2)) y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + return x_grad, grad_scale, grad_offset From ca16e8fd7bd1bf27abb9b2cea053b9f98eddea76 Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Fri, 8 Oct 2021 16:52:05 +0800 Subject: [PATCH 26/80] add fs list_files_info (#36224) --- python/paddle/distributed/fleet/utils/fs.py | 32 +++++++++++++++++++ .../fluid/tests/unittests/hdfs_test_utils.py | 9 ++++++ .../fluid/tests/unittests/test_hdfs2.py | 1 + 3 files changed, 42 insertions(+) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index fb518f62a1269..d3f84d50ac8f9 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -1106,3 +1106,35 @@ def _split_files(self, files, trainer_id, trainers): begin += blocks[i] return trainer_files[trainer_id] + + def list_files_info(self, path_list): + """ + list_files return file path and size + Args: + path_list(list): file list + Returns: + fileist(list): file list with file path and size + """ + if len(path_list) <= 0: + return [] + + file_list = [] + + #concat filelist can speed up 'hadoop ls' + str_concat = "" + for path in path_list: + str_concat += path + " " + cmd = "ls " + str_concat + " | awk '{if ($8 != \"\") {print $5\" \"$8 }}'" + ret, lines = self._run_cmd(cmd) + if (len(lines) == 0): + logger.warning("list_files empty, path[%s]" % path_list) + return [] + for line in lines: + arr = line.split(' ') + if len(arr) < 2: + continue + file_path = arr[1] + file_size = int(arr[0]) + file_list.append({'path': file_path, 'size': file_size}) + + return file_list diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 1535fac499ec6..6b49049073948 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -245,6 +245,15 @@ def _test_touch(self, fs): self.assertFalse(fs.is_dir(path)) fs.delete(path) + def _test_list_files_info(self, fs): + path = [] + fs.list_files_info(path) + path = ["./list_files_info.flag"] + fs.list_files_info(path) + fs.touch(path, exist_ok=True) + fs.list_files_info(path) + fs.delete(path) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py index 1fa019bb9cd02..a74fc558382fe 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs2.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py @@ -35,6 +35,7 @@ def test_hdfs(self): self._test_rm(fs) self._test_touch(fs) self._test_dirs(fs) + self._test_list_files_info(fs) def test_local(self): fs = LocalFS() From 
f9591bb172e7274a77bfdcb6493579824aec8b47 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 8 Oct 2021 18:06:26 +0800 Subject: [PATCH 27/80] Support CUDA Graph on ParallelExecutor (#36250) * support CUDA Graph on PE * add ut, fix CI compile * reduce memory consumption * fix CUDA 10 CI * improve coverage * improve python coverage --- .../fluid/framework/details/build_strategy.h | 2 + .../details/scale_loss_grad_op_handle.cc | 19 ++- .../details/scale_loss_grad_op_handle.h | 6 + .../scope_buffered_ssa_graph_executor.cc | 53 ++++--- .../scope_buffered_ssa_graph_executor.h | 2 +- .../framework/distributed_strategy.proto | 1 + .../multi_devices_graph_pass/CMakeLists.txt | 2 +- .../modify_op_lock_and_record_event_pass.cc | 14 +- paddle/fluid/framework/parallel_executor.cc | 143 ++++++++++++++++++ paddle/fluid/framework/parallel_executor.h | 2 + paddle/fluid/operators/conv_cudnn_helper.h | 3 + paddle/fluid/platform/cuda_graph.cc | 12 ++ paddle/fluid/platform/cuda_graph.h | 10 +- .../platform/cuda_graph_with_memory_pool.cc | 9 +- paddle/fluid/platform/gpu_info.cc | 2 +- paddle/fluid/pybind/pybind.cc | 27 +++- python/paddle/fluid/executor.py | 12 +- .../fluid/tests/unittests/test_cuda_graph.py | 91 ++++++++++- 18 files changed, 368 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 0629f1b91504a..25110fe24f587 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -143,6 +143,8 @@ struct BuildStrategy { // Turn off inplace addto by default. bool enable_addto_{false}; + bool allow_cuda_graph_capture_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. 
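A condensed usage sketch of the new allow_cuda_graph_capture flag, pieced together from the unit test updated later in this patch (test_cuda_graph.py, which imports CUDAGraph from paddle.device.cuda.graphs); `main`, `loss`, `place` and `exe` are assumed to be set up as in that test, and one warm-up batch has to run before capturing because capture is rejected on the first batch:

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.allow_cuda_graph_capture = True
    build_strategy.fix_op_run_order = True
    build_strategy.fuse_all_optimizer_ops = True
    compiled = paddle.static.CompiledProgram(main).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy, places=place)

    exe.run(compiled)                      # warm-up batch, outside the capture
    cuda_graph = CUDAGraph(place, mode="global")
    cuda_graph.capture_begin()
    exe.run(compiled)                      # this run is recorded into the graph
    cuda_graph.capture_end()
    cuda_graph.replay()                    # later iterations replay the capture

Feeding and fetching are rejected while the graph is being captured, which is why the test marks its image/label variables persistable and writes input data directly into their scope tensors.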
diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index c0c3e14c8bf23..1e3cd4f0aa77c 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -86,19 +86,28 @@ struct ScaleLossGradFunctor { } }; +std::string ScaleLossGradOpHandle::LossGradName() const { + return static_cast(this->outputs_[0])->name(); +} + void ScaleLossGradOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); - // Doesn't wait any event - std::string var_name = static_cast(this->outputs_[0])->name(); + RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true); +} - auto *tensor = - local_exec_scopes_[0]->FindVar(var_name)->GetMutable(); +void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { + auto *tensor = var->GetMutable(); tensor->Resize(make_ddim({1})); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); - this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); + if (record_event) { + this->RunAndRecordEvent( + [&] { framework::VisitDataType(out_dtype_, func); }); + } else { + framework::VisitDataType(out_dtype_, func); + } #else ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr); framework::VisitDataType(out_dtype_, func); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 02e5aa88443df..88fe02a749fe4 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase { std::string Name() const override; + platform::Place GetPlace() const { return place_; } + + void RunOnVar(Variable *var, bool record_event = false); + + std::string LossGradName() const; + protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index ad47846c59a05..5d271d06b6922 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -22,7 +22,9 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/profiler.h" + namespace paddle { namespace framework { namespace details { @@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( PrepareLocalExeScopes(); } +static void RunProgramDescs(const ProgramDescs &programs, + const std::vector &local_exec_scopes, + const std::vector &places) { + for (auto &program : programs) { + for (auto &op_desc : program.Block(0).AllOps()) { + for (size_t i = 0; i < local_exec_scopes.size(); ++i) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_exec_scopes[i], places[i]); + } + } + } +} + FetchResultType ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + strategy_.num_iteration_per_drop_scope_ = + std::numeric_limits::max(); + DropLocalExeScopes(/*need_wait=*/false); + } +#endif + if 
(drop_scope_counter_ == 0) { platform::RecordEvent e("InitLocalVars"); InitVariables(); @@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run( ++drop_scope_counter_; if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ || DropScopeOrNot()) { - DropLocalExeScopes(); + DropLocalExeScopes(!platform::IsCUDAGraphCapturing()); } if (VLOG_IS_ON(5)) { @@ -128,15 +151,7 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if (graph.Has(details::kStartupProgramDescs)) { auto &program_descs = graph.Get(details::kStartupProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } is_initialized_ = true; } @@ -144,23 +159,17 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if (graph.Has(details::kProgramDescs)) { auto &program_descs = graph.Get(details::kProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } } -void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() { +void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { platform::RecordEvent drop_scope_event("DropLocalExeScopes"); drop_scope_counter_ = 0; - for (auto &p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + if (need_wait) { + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } } scope_monitor_.ClearHistoryLocalExecScopes(); for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index aa2b113c960a3..ea5a3c07957bf 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FetchResultType Run(const std::vector& fetch_tensors, bool return_merged) override; - void DropLocalExeScopes(); + void DropLocalExeScopes(bool need_wait = true); bool NeedCreateLocalExeScope(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 17d15a94c7287..e7a25de96a947 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -115,6 +115,7 @@ message BuildStrategy { optional bool enable_auto_fusion = 11 [ default = false ]; optional bool enable_addto = 12 [ default = false ]; optional bool fix_op_run_order = 13 [ default = false ]; + optional bool allow_cuda_graph_capture = 14 [ default = false ]; } message ExecutionStrategy { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index 6764799d82866..fea12baf0651f 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(modify_op_lock_and_record_event_pass SRCS 
modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) +cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc index 70b95c9154fd3..afd80e45cf65e 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" @@ -21,14 +22,23 @@ namespace paddle { namespace framework { namespace ir { +template +static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base, + const platform::Place &place) { + auto *op = dynamic_cast(op_base); + return op && op->GetPlace() == place; +} + static bool IsLockAndRecordEventFreeComputationOpHandle( details::ComputationOpHandle *op, const OpGraphView &graph_view) { if (!platform::is_gpu_place(op->GetPlace()) && !platform::is_xpu_place(op->GetPlace())) return false; for (auto &pending_op : graph_view.PendingOps(op)) { - auto *tmp = dynamic_cast(pending_op); - if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) { + if (!IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace()) && + !IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace())) { return false; } } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index adbbfb380bc45..d19ac0b65f4d1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -34,6 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" @@ -43,6 +45,10 @@ limitations under the License. 
*/ DECLARE_double(eager_delete_tensor_gb); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DECLARE_bool(sync_nccl_allreduce); +#endif + #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif @@ -669,6 +675,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // ncclOp std::vector async_graphs = CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); + PrepareForCUDAGraphCapture(graph); graph = member_->ApplyMemoryOptimizePass(graph); async_graphs[0] = graph; @@ -882,6 +889,23 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter ParallelExecutor Run"; +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + PADDLE_ENFORCE_EQ( + member_->build_strategy_.allow_cuda_graph_capture_, true, + platform::errors::InvalidArgument( + "You must turn on build_strategy.allow_cuda_graph_capture = True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + member_->places_[0], platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif + #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); @@ -932,6 +956,16 @@ void ParallelExecutor::SkipMemoryReuse( void ParallelExecutor::FeedTensorsIntoLocalScopes( const std::vector> &tensors) { + if (platform::IsCUDAGraphCapturing()) { + for (auto &tensor : tensors) { + PADDLE_ENFORCE_EQ( + tensor.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + } + return; + } + if (!member_->AllowPartialFeed()) { PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(), platform::errors::Unimplemented( @@ -987,6 +1021,14 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ( + tensors.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + return; + } + size_t num_places = member_->places_.size(); bool allow_partial_feed = member_->AllowPartialFeed(); @@ -1568,6 +1610,107 @@ const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } +void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { + const auto &build_strategy = member_->build_strategy_; + if (!build_strategy.allow_cuda_graph_capture_) return; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_EQ( + build_strategy.async_mode_, false, + platform::errors::InvalidArgument( + "Async Executor does not support CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + platform::IsCUDAGraphCapturing(), false, + platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " + "when running the first batch.")); + PADDLE_ENFORCE_EQ( + member_->places_.size(), 1, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when one GPU device is running.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true, + platform::errors::InvalidArgument( + "CUDA Graph is only supported on NVIDIA GPU device.")); + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false, + platform::errors::InvalidArgument( + "FLAGS_sync_nccl_allreduce must be False to support " + "CUDA Graph capturing.")); + 
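+  // Overview of the steps below: fused vars, pinned vars, startup outputs and
+  // the loss gradient are all marked persistable so they are not dropped
+  // between iterations, one-off main programs are folded into the startup
+  // programs so they run only once, and the loss-grad scaling op is executed
+  // here once and then skipped inside the captured graph.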
+ std::unordered_map> all_vars; + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + auto *var_desc = node->Var(); + all_vars[var_desc->Name()].emplace_back(var_desc); + } + } + + auto mark_var_as_persistable = [&all_vars](const std::string &name) { + auto iter = all_vars.find(name); + if (iter != all_vars.end()) { + for (auto *var_desc : iter->second) { + var_desc->SetPersistable(true); + } + } + }; + + // Step 1: All fused vars must be persistable. + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + fused_var.second.persistable_ = true; + mark_var_as_persistable(fused_var.first); + } + } + + // Step 2: All pinned vars must be persistable. + if (graph->Has(details::kPinnedVars)) { + auto &pinned_vars = graph->Get(details::kPinnedVars); + for (auto &pinned_var : pinned_vars) { + mark_var_as_persistable(pinned_var); + } + } + + // Step 3: Move all main programs to startup programs to make sure that + // the main programs would only be run once. + if (graph->Has(details::kProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + auto &main_programs = + graph->Get(details::kProgramDescs); + for (auto &main_program : main_programs) { + startup_programs.emplace_back(main_program); + } + graph->Erase(details::kProgramDescs); + } + + // Step 4: Mark all vars in startup programs to be persistable. + if (graph->Has(details::kStartupProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + for (auto &startup_program : startup_programs) { + for (auto &op_desc : startup_program.Block(0).AllOps()) { + for (auto &output : op_desc->OutputArgumentNames()) { + mark_var_as_persistable(output); + } + } + } + } + + // Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy. + auto ops = ir::FilterByNodeWrapper(*graph); + auto *scope = member_->local_scopes_[0]; + for (auto *op : ops) { + auto *loss_grad_op = dynamic_cast(op); + if (loss_grad_op == nullptr) continue; + auto loss_grad_name = loss_grad_op->LossGradName(); + mark_var_as_persistable(loss_grad_name); + loss_grad_op->RunOnVar(scope->Var(loss_grad_name)); + loss_grad_op->SetSkipRunning(true); + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 6c871a8d85815..78774f0489638 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -144,6 +144,8 @@ class ParallelExecutor { void SetReaderOpDeviceInfoOfGraphs( const std::vector &final_graphs); + void PrepareForCUDAGraphCapture(ir::Graph *graph); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; std::vector var_infos_; diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4c0ef02074e2e..f4183bf570926 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/cudnn_desc.h" namespace paddle { namespace operators { @@ -480,6 +481,7 @@ struct SearchAlgorithm { static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, const framework::ExecutionContext& ctx) { + platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -601,6 +603,7 @@ struct SearchAlgorithm { } static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/cuda_graph.cc index 6e518d779e9cd..693a592799027 100644 --- a/paddle/fluid/platform/cuda_graph.cc +++ b/paddle/fluid/platform/cuda_graph.cc @@ -70,6 +70,9 @@ void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream, cudaStreamCaptureStatus status; PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo( capturing_graph_->stream_, &status, &(capturing_graph_->id_))); + PADDLE_ENFORCE_EQ(IsValidCapturing(), true, + platform::errors::PermissionDenied( + "CUDA Graph should not be invalidated.")); VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_; } @@ -88,5 +91,14 @@ std::unique_ptr CUDAGraph::EndCapture() { #endif } +bool CUDAGraph::IsValidCapturing() { + if (!IsCapturing()) return false; + cudaStreamCaptureStatus status; + CUDAGraphID id; + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); + return status == cudaStreamCaptureStatusActive; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/cuda_graph.h index 41e36049aa1a0..55ec463556b45 100644 --- a/paddle/fluid/platform/cuda_graph.h +++ b/paddle/fluid/platform/cuda_graph.h @@ -84,6 +84,10 @@ class CUDAGraph { return capturing_graph_->place_; } + // This API can be used to debug which GPU operation is not + // supported during capturing CUDA Graph. 
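+  // Returns false when no capture is in progress; during a capture it stays
+  // true only while the underlying stream capture status is still
+  // cudaStreamCaptureStatusActive (e.g. an unsupported operation invalidates
+  // the capture and flips this to false).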
+ static bool IsValidCapturing(); + private: #if CUDA_VERSION >= 10010 cudaGraph_t graph_{nullptr}; @@ -104,7 +108,8 @@ class CUDAGraphCaptureModeGuard { DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); public: - explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode mode) { + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { if (UNLIKELY(CUDAGraph::IsCapturing())) { PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); // After cudaThreadExchangeStreamCaptureMode is called, @@ -128,7 +133,8 @@ class CUDAGraphCaptureModeGuard { DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); public: - explicit CUDAGraphCaptureModeGuard(cudaStreamCaptureMode) {} + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} }; #endif diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc index 1f0d39e2abe23..4804d3f6ed301 100644 --- a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -22,8 +22,10 @@ namespace platform { #ifdef PADDLE_WITH_CUDA void BeginCUDAGraphCapture(platform::CUDAPlace place, cudaStreamCaptureMode mode) { - auto stream = - platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + + auto stream = dev_ctx->stream(); CUDAGraph::BeginCapture(place, stream, mode); auto id = CUDAGraph::CapturingID(); memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( @@ -35,6 +37,9 @@ void BeginCUDAGraphCapture(platform::CUDAPlace place, } std::unique_ptr EndCUDAGraphCapture() { + auto place = CUDAGraph::CapturingPlace(); + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); return CUDAGraph::EndCapture(); } #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 59e4404ffe535..c624ba94b74a3 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -558,7 +558,7 @@ class RecordedCudaMallocHelper { #ifdef PADDLE_WITH_HIP auto result = hipMalloc(ptr, size); #else - CUDAGraphCaptureModeGuard capture_mode_guard{cudaStreamCaptureModeRelaxed}; + CUDAGraphCaptureModeGuard capture_mode_guard; auto result = cudaMalloc(ptr, size); #endif if (result == gpuSuccess) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6b24c64492581..f58c2a5db381c 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -736,6 +736,17 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return reinterpret_cast(self.mutable_data(place, type)); }) + .def("_copy_from", + [](framework::Tensor &self, const framework::Tensor &other, + const platform::Place &place, int64_t batch_size) { + if (batch_size < 0) { + framework::TensorCopy(other, place, &self); + } else { + auto sliced = other.Slice(0, batch_size); + framework::TensorCopy(sliced, place, &self); + } + }, + py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, @@ -2299,7 +2310,14 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); - m.def("cuda_empty_cache", platform::EmptyCache); + m.def("cuda_empty_cache", [] { + for (int dev_id : platform::GetSelectedDevices()) { + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( + platform::CUDAPlace(dev_id)); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + } + platform::EmptyCache(); + }); m.def("get_device_properties", [](int id) -> const gpuDeviceProp & { return platform::GetDeviceProperties(id); @@ -3211,6 +3229,13 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, bool fix_op_run_order) { self.fix_op_run_order_ = fix_op_run_order; }) + .def_property("allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) .def("_copy", [](const BuildStrategy &self) { auto new_bs = self; diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 4c7537d8d5c8e..8c118f31cbe87 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1044,9 +1044,15 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, lr_value = lr_sheduler() lr_var = program._program.global_block().vars[lr_sheduler._var_name] lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype) - exe.feed_and_split_tensor_into_local_scopes({ - lr_sheduler._var_name: lr_tensor - }) + if core.is_cuda_graph_capturing(): + warnings.warn( + "Caution!!! When capturing CUDA Graph, the learning rate scheduler would not " + "take any effect! Please set the learning rate manually before each batch!" 
+ ) + else: + exe.feed_and_split_tensor_into_local_scopes({ + lr_sheduler._var_name: lr_tensor + }) fetch_var_names = list(map(_to_name_str, fetch_list)) tensors = exe.run(fetch_var_names, return_merged)._move_to_list() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py index 272d68e17fcc4..7d1317473531e 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_graph.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -17,18 +17,105 @@ from paddle.device.cuda.graphs import CUDAGraph import unittest import numpy as np +from paddle.fluid.dygraph.base import switch_to_static_graph +from simple_nets import simple_fc_net_with_inputs class TestCUDAGraph(unittest.TestCase): def setUp(self): - fluid.set_flags({'FLAGS_allocator_strategy': 'auto_growth'}) + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm( + ): + fluid.set_flags({ + 'FLAGS_allocator_strategy': 'auto_growth', + 'FLAGS_sync_nccl_allreduce': False, + 'FLAGS_cudnn_deterministic': True + }) def random_tensor(self, shape): return paddle.to_tensor( np.random.randint( low=0, high=10, size=shape).astype("float32")) - def test_cuda_graph(self): + @switch_to_static_graph + def test_cuda_graph_static_graph(self): + if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): + return + + seed = 100 + loss_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=True) + loss_no_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=False) + self.assertEqual(loss_cuda_graph, loss_no_cuda_graph) + + def cuda_graph_static_graph_main(self, seed, use_cuda_graph): + batch_size = 1 + class_num = 10 + image_shape = [batch_size, 784] + label_shape = [batch_size, 1] + + paddle.seed(seed) + np.random.seed(seed) + startup = paddle.static.Program() + main = paddle.static.Program() + with paddle.static.program_guard(main, startup): + image = paddle.static.data( + name="image", shape=image_shape, dtype='float32') + label = paddle.static.data( + name="label", shape=label_shape, dtype='int64') + image.persistable = True + label.persistable = True + loss = simple_fc_net_with_inputs(image, label, class_num) + loss.persistable = True + lr = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04]) + optimizer = paddle.optimizer.SGD(learning_rate=lr) + optimizer.minimize(loss) + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + exe.run(startup) + build_strategy = paddle.static.BuildStrategy() + build_strategy.allow_cuda_graph_capture = True + build_strategy.fix_op_run_order = True + build_strategy.fuse_all_optimizer_ops = True + compiled_program = paddle.static.CompiledProgram( + main).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + places=place) + image_t = scope.var(image.name).get_tensor() + label_t = scope.var(label.name).get_tensor() + loss_t = scope.var(loss.name).get_tensor() + lr_var = main.global_block().var(lr._var_name) + self.assertTrue(lr_var.persistable) + lr_t = scope.var(lr_var.name).get_tensor() + cuda_graph = None + for batch_id in range(20): + image_t.set( + np.random.rand(*image_shape).astype('float32'), place) + label_t.set(np.random.randint( + low=0, high=class_num, size=label_shape, dtype='int64'), + place) + + if batch_id == 1 and use_cuda_graph: + cuda_graph = CUDAGraph(place, mode="global") + cuda_graph.capture_begin() + 
exe.run(compiled_program) + cuda_graph.capture_end() + + if cuda_graph: + lr_t.set(np.array([lr()], dtype='float32'), place) + cuda_graph.replay() + else: + exe.run(compiled_program) + lr.step() + if cuda_graph: + cuda_graph.reset() + return np.array(loss_t) + + def test_cuda_graph_dynamic_graph(self): if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): return From 57e8cbecaf06a54686f9aa28f2a8a84d32dcae6f Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Fri, 8 Oct 2021 17:29:51 +0200 Subject: [PATCH 28/80] Fix for oneDNN conv op (#36284) * fix for conv op * Minor change --- paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index c663ba2f88680..cce835e6bc035 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -217,9 +217,10 @@ class ConvMKLDNNHandlerT const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; - float sum_scale; + float sum_scale = 1.0f; std::vector output_shift_scale; - std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + if (platform::is_int8()) + std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); const mkldnn::primitive_attr conv_attr = CreatePostOps( fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, From d8887afaf0d4ae9bb30831f58cd5eb62e3f63e0a Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Sat, 9 Oct 2021 10:08:52 +0800 Subject: [PATCH 29/80] fix hasattr(paddle.fluid.ir.PassDesc.OP, '__name__') error (#36229) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 对于__getattr__重载后不满足条件的参数,全部抛出AttributeError异常,达到与未重载版本一致。 --- python/paddle/fluid/ir.py | 10 ++++++---- .../fluid/tests/unittests/ir/test_ir_generate_pass.py | 3 +++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py index 17b7ea1122ab7..7e2d3df1ce1e4 100644 --- a/python/paddle/fluid/ir.py +++ b/python/paddle/fluid/ir.py @@ -230,9 +230,6 @@ def __init__(self, type=None): self._type = type def __getattr__(self, name): - if self._type is not None: - raise AttributeError( - "type object 'OpHelper' has no attribute '{}'".format(name)) op = PassDesc.OpHelper(name) op.Init() return op @@ -261,7 +258,12 @@ def Init(self): self._op_idx = len(block.ops) self._op_desc = block.desc.append_op() self._op_desc.set_type(self._type) - self._op_proto = OpProtoHolder.instance().get_op_proto(self._type) + self._op_proto = OpProtoHolder.instance().op_proto_map.get( + self._type) + if self._op_proto is None: + raise AttributeError( + "type object 'OpHelper' has no attribute '{}'".format( + self._type)) block.ops.append(self) def Attr(self, name): diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py index c8b9d5e5739dd..851ae21c38378 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py @@ -123,6 +123,9 @@ def convert_ops_to_op_dicts(self, ops): op_dicts[op.type] = [op] return op_dicts + def test_has_attr(self): + self.assertFalse(hasattr(ir.PassDesc.OP, '__name__')) + def test_generate_fc_fuse(self): def _check_fc_fuse_pass(pass_desc, with_relu): pattern_op_dicts = 
self.convert_ops_to_op_dicts( From 2fd8deea8d6dedd567000fb092f4c1292e6dbdc8 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Sat, 9 Oct 2021 10:09:10 +0800 Subject: [PATCH 30/80] C++ support register pass via PassDesc (#36095) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 支持C++开发注册GeneratePass,简化针对fusion等子图优化场景开发方式。 --- paddle/fluid/framework/ir/generate_pass.cc | 110 ++++++++ paddle/fluid/framework/ir/generate_pass.h | 153 +++++++++- .../framework/ir/generate_pass_tester.cc | 267 ++++-------------- 3 files changed, 314 insertions(+), 216 deletions(-) diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 9eba6fc89a2e9..085298314ea3f 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -224,6 +225,115 @@ bool GeneratePass::VerifyGraph(const Graph& graph) { return true; } +namespace generate_pass { + +VarHelper::VarHelper(const char* name) : name_(name), type_(Type::kInput) {} +VarHelper::VarHelper(const std::string& name, Type type) + : name_(name), type_(type) {} + +OpHelper::OpHelper(const char* type, SubgraphHelper* subgraph_helper) + : type_(type), subgraph_helper_(subgraph_helper) { + op_desc_ = subgraph_helper_->ProgramDesc()->mutable_blocks(0)->add_ops(); + op_desc_->set_type(type_); +} + +OpHelper::Arguments::Arguments(const char* parameter, + const VarHelper& var_helper) + : parameter_(parameter) { + var_helpers_.push_back(var_helper); +} + +OpHelper::Arguments::Arguments(const char* parameter, + std::initializer_list var_helpers) + : parameter_(parameter), var_helpers_(var_helpers) {} + +OpHelper& OpHelper::operator()(const Arguments& input) { + proto::OpDesc::Var* var = op_desc_->add_inputs(); + var->set_parameter(input.parameter_); + for (const VarHelper& var_helper : input.var_helpers_) { + var->add_arguments()->assign(var_helper.name_); + if (VarHelper::Type::kInput == var_helper.type_) { + subgraph_helper_->AddInputVar(var_helper.name_); + } + } + return *this; +} + +OpHelper& OpHelper::operator()(std::initializer_list inputs) { + for (const auto& input : inputs) { + operator()(input); + } + return *this; +} + +VarHelper OpHelper::Out(const char* name) { + std::string argument = patterns::UniqueKey(type_); + proto::OpDesc::Var* var = op_desc_->add_outputs(); + var->set_parameter(name); + var->add_arguments()->assign(argument); + return VarHelper(argument, VarHelper::Type::kOutput); +} + +proto::ProgramDesc* SubgraphHelper::ProgramDesc() { return &program_desc_; } + +const proto::ProgramDesc& SubgraphHelper::ProgramDesc() const { + return program_desc_; +} + +const std::vector& SubgraphHelper::InputVars() const { + return input_vars_; +} + +const std::vector& SubgraphHelper::OutputVars() const { + return output_vars_; +} + +void SubgraphHelper::AddInputVar(const std::string& name) { + auto iter = std::find(input_vars_.begin(), input_vars_.end(), name); + if (input_vars_.end() == iter) { + input_vars_.push_back(name); + } +} + +void SubgraphHelper::AddOutputVars(const VarHelper& var_helper) { + output_vars_.push_back(var_helper.name_); +} + +} // namespace generate_pass + +PassPairs::PassPairs(const SubgraphType& pattern, const SubgraphType& replace) { + AddPassDesc(pattern, replace); +} + +void PassPairs::AddPassDesc(const 
SubgraphType& pattern, + const SubgraphType& replace) { + proto::PassDesc* pass_desc = multi_pass_desc_.add_pass_descs(); + pass_desc->mutable_pattern()->CopyFrom(pattern.ProgramDesc()); + pass_desc->mutable_replace()->CopyFrom(replace.ProgramDesc()); + PADDLE_ENFORCE_EQ(pattern.InputVars().size(), replace.InputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression arguments is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.InputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + var_map->set_pattern_var(pattern.InputVars()[i]); + var_map->set_replace_var(replace.InputVars()[i]); + } + PADDLE_ENFORCE_EQ(pattern.OutputVars().size(), replace.OutputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression returns is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.OutputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + var_map->set_pattern_var(pattern.OutputVars()[i]); + var_map->set_replace_var(replace.OutputVars()[i]); + } +} + +const proto::MultiPassDesc& PassPairs::MultiPassDesc() const { + return multi_pass_desc_; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass.h b/paddle/fluid/framework/ir/generate_pass.h index f73173233aed3..26e5231fbc16e 100644 --- a/paddle/fluid/framework/ir/generate_pass.h +++ b/paddle/fluid/framework/ir/generate_pass.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/pass_desc.pb.h" @@ -43,6 +42,158 @@ class GeneratePass : public Pass { proto::MultiPassDesc multi_pass_desc_; }; +namespace generate_pass { + +class VarHelper; +class OpHelper; +class SubgraphHelper; + +// VarHelper is used to represent a variable node. +struct VarHelper { + enum class Type { kInput, kOutput }; + + explicit VarHelper(const char* name); + VarHelper(const std::string& name, Type type); + + std::string name_; + Type type_; +}; + +// OpHelper is used to represent a operator node. +class OpHelper { + public: + // Convert multiple inputs. + struct Arguments { + Arguments(const char* parameter, const VarHelper& var_helper); + Arguments(const char* parameter, + std::initializer_list var_helpers); + + std::string parameter_; + std::vector var_helpers_; + }; + + OpHelper(const char* type, SubgraphHelper* subgraph_helper); + + OpHelper& operator()(const Arguments& input); + OpHelper& operator()(std::initializer_list inputs); + + VarHelper Out(const char* name); + + private: + OpHelper() = delete; + DISABLE_COPY_AND_ASSIGN(OpHelper); + + const char* type_; + proto::OpDesc* op_desc_; + SubgraphHelper* subgraph_helper_; +}; + +/* + * SubgraphHelper is used to define pattern/replace subgraphs. + * + * Use lambda expression to define subgraph like Python. SubgraphHelper + * converts lambda expression to ProgramDesc. + * + * In order to define a subgraph, user need to use VarHelper and OpHelper. + * Use the macros instead of class names, so user can develop better and + * don't need to know too much about underlying implementation. 
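+ *
+ * The macros below are the intended entry points: SUBGRAPH_(name) declares
+ * a pattern/replace subgraph, VAR_(name) declares an input variable node,
+ * and OP_(type) appends an operator of that type to the current subgraph.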
+ * + * An example of defining a subgraph as follows: + * + * SUBGRAPH_(subgraph)([subgraph=&subgraph](VAR_(x), VAR_(y), VAR_(z)) { + * auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + * auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + * return ewadd2; + * }); + * + */ +class SubgraphHelper { + public: + SubgraphHelper() = default; + // The lambda expression is a prvalue expression. + template + SubgraphHelper& operator=(const T&& f) { + proto::BlockDesc* block = program_desc_.add_blocks(); + block->set_idx(0); + block->set_parent_idx(0); + AddOutputVars(f()); + return *this; + } + + proto::ProgramDesc* ProgramDesc(); + const proto::ProgramDesc& ProgramDesc() const; + const std::vector& InputVars() const; + const std::vector& OutputVars() const; + + void AddInputVar(const std::string& name); + + void AddOutputVars(const VarHelper& var_helper); + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + AddOutputVars(outputs); + } + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + } + + template + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars<0>(outputs); + } + + private: + DISABLE_COPY_AND_ASSIGN(SubgraphHelper); + std::vector input_vars_; + std::vector output_vars_; + proto::ProgramDesc program_desc_; +}; + +} // namespace generate_pass + +class PassPairs { + public: + using SubgraphType = generate_pass::SubgraphHelper; + + PassPairs() = default; + PassPairs(const SubgraphType& pattern, const SubgraphType& replace); + + void AddPassDesc(const SubgraphType& pattern, const SubgraphType& replace); + + const proto::MultiPassDesc& MultiPassDesc() const; + + private: + proto::MultiPassDesc multi_pass_desc_; +}; + +// Use function to register in CC. 
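+// REGISTER_GENERATE_PASS(pass_type) declares a register_##pass_type()
+// function returning PassPairs, registers a MacroPassHelper instantiated
+// with the address of that function as the pass, and leaves the function
+// body to be written by the user with the VAR_/OP_/SUBGRAPH_ macros below.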
+template +class MacroPassHelper : public GeneratePass { + public: + MacroPassHelper() : GeneratePass(Functor().MultiPassDesc()) {} +}; + +#define VAR_(name) \ + ::paddle::framework::ir::generate_pass::VarHelper name = \ + ::paddle::framework::ir::generate_pass::VarHelper(#name) +#define OP_(type) \ + ::paddle::framework::ir::generate_pass::OpHelper(#type, subgraph) +#define SUBGRAPH_(name) \ + ::paddle::framework::ir::generate_pass::SubgraphHelper name; \ + name + +#define REGISTER_GENERATE_PASS(pass_type) \ + paddle::framework::ir::PassPairs register_##pass_type(); \ + REGISTER_PASS( \ + pass_type, \ + ::paddle::framework::ir::MacroPassHelper<®ister_##pass_type>); \ + paddle::framework::ir::PassPairs register_##pass_type() + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index c3852d29c308f..6876dde50c157 100644 --- a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -16,234 +16,71 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -namespace paddle { -namespace framework { -namespace ir { - -template -class CXXGeneratePass : public GeneratePass { - public: - CXXGeneratePass() : GeneratePass(Functor()) {} -}; - -#define REGISTER_GENERATE_PASS(pass_type, function) \ - REGISTER_PASS(pass_type, ::paddle::framework::ir::CXXGeneratePass<&function>) - -proto::MultiPassDesc generate_fc_fuse() { - proto::MultiPassDesc multi_pass_desc; +REGISTER_GENERATE_PASS(generate_fc_fuse) { + paddle::framework::ir::PassPairs pass_pairs; for (bool with_relu : {true, false}) { - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - pattern->set_idx(0); - pattern->set_parent_idx(0); - proto::OpDesc* mul = pattern->add_ops(); - mul->set_type("mul"); - proto::OpDesc::Var* mul_x = mul->add_inputs(); - mul_x->set_parameter("X"); - mul_x->add_arguments()->assign("x"); - proto::OpDesc::Var* mul_y = mul->add_inputs(); - mul_y->set_parameter("Y"); - mul_y->add_arguments()->assign("w"); - proto::OpDesc::Var* mul_out = mul->add_outputs(); - mul_out->set_parameter("Out"); - mul_out->add_arguments()->assign("mul_out"); - proto::OpDesc* ewadd = pattern->add_ops(); - ewadd->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_x = ewadd->add_inputs(); - ewadd_x->set_parameter("X"); - ewadd_x->add_arguments()->assign("mul_out"); - proto::OpDesc::Var* ewadd_y = ewadd->add_inputs(); - ewadd_y->set_parameter("Y"); - ewadd_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_out = ewadd->add_outputs(); - ewadd_out->set_parameter("Out"); - ewadd_out->add_arguments()->assign("ewadd_out"); - proto::OpDesc* relu = nullptr; - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - replace->set_idx(0); - replace->set_parent_idx(0); - proto::OpDesc* fc = replace->add_ops(); - fc->set_type("fc"); - proto::OpDesc::Var* fc_x = fc->add_inputs(); - fc_x->set_parameter("Input"); - fc_x->add_arguments()->assign("x"); - proto::OpDesc::Var* fc_w = fc->add_inputs(); - fc_w->set_parameter("W"); - fc_w->add_arguments()->assign("w"); - proto::OpDesc::Var* fc_b = fc->add_inputs(); - fc_b->set_parameter("Bias"); - fc_b->add_arguments()->assign("b"); - proto::OpDesc::Var* fc_out = fc->add_outputs(); - fc_out->set_parameter("Out"); - fc_out->add_arguments()->assign("fc_out"); - for (const char* var : {"x", "w", "b", 
"fc_out"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - proto::PassDesc::AttrMap* attr_map = pass_desc->add_attr_maps(); - attr_map->set_pattern_op_idx(0); - attr_map->set_pattern_name("x_num_col_dims"); - attr_map->set_replace_op_idx(0); - attr_map->set_replace_name("in_num_col_dims"); - if (with_relu) { - relu = pattern->add_ops(); - relu->set_type("relu"); - proto::OpDesc::Var* relu_x = relu->add_inputs(); - relu_x->set_parameter("X"); - relu_x->add_arguments()->assign("ewadd_out"); - proto::OpDesc::Var* relu_out = relu->add_outputs(); - relu_out->set_parameter("Out"); - relu_out->add_arguments()->assign("relu_out"); - pass_desc->mutable_var_maps(3)->set_pattern_var("relu_out"); - proto::OpDesc::Attr* attr = fc->add_attrs(); - attr->set_name("activation_type"); - attr->set_type(proto::AttrType::STRING); - attr->set_s("relu"); - } else { - pass_desc->mutable_var_maps(3)->set_pattern_var("ewadd_out"); - } + // pattern + SUBGRAPH_(pattern) = + [ subgraph = &pattern, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + VLOG(3) << "exec lambda func."; + auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out"); + if (with_relu) { + return OP_(relu)({"X", ewadd}).Out("Out"); + } else { + return ewadd; + } + }; + // replace + SUBGRAPH_(replace) = + [ subgraph = &replace, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}}); + return fc.Out("Out"); + }; + pass_pairs.AddPassDesc(pattern, replace); } - return multi_pass_desc; + return pass_pairs; } -proto::MultiPassDesc generate_multi_add_to_addn() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* ewadd_0 = pattern->add_ops(); - ewadd_0->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_0_x = ewadd_0->add_inputs(); - ewadd_0_x->set_parameter("X"); - ewadd_0_x->add_arguments()->assign("a"); - proto::OpDesc::Var* ewadd_0_y = ewadd_0->add_inputs(); - ewadd_0_y->set_parameter("Y"); - ewadd_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_0_out = ewadd_0->add_outputs(); - ewadd_0_out->set_parameter("Out"); - ewadd_0_out->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc* ewadd_1 = pattern->add_ops(); - ewadd_1->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_1_x = ewadd_1->add_inputs(); - ewadd_1_x->set_parameter("X"); - ewadd_1_x->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc::Var* ewadd_1_y = ewadd_1->add_inputs(); - ewadd_1_y->set_parameter("Y"); - ewadd_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* ewadd_1_out = ewadd_1->add_outputs(); - ewadd_1_out->set_parameter("Out"); - ewadd_1_out->add_arguments()->assign("ewadd_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* addn = replace->add_ops(); - addn->set_type("add_n"); - proto::OpDesc::Var* addn_x = addn->add_inputs(); - addn_x->set_parameter("X"); - addn_x->add_arguments()->assign("a"); - addn_x->add_arguments()->assign("b"); - addn_x->add_arguments()->assign("c"); - proto::OpDesc::Var* addn_out = addn->add_outputs(); - addn_out->set_parameter("Out"); - addn_out->add_arguments()->assign("addn_out"); - for (const char* var : {"a", "b", "c", "ewadd_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); 
- var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("addn_out"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_multi_add_to_addn) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + return ewadd2; + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + return OP_(sum)({"X", {x, y, z}}).Out("Out"); + }; + return {pattern, replace}; } -proto::MultiPassDesc generate_combine_matmul() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* matmul_0 = pattern->add_ops(); - matmul_0->set_type("matmul"); - proto::OpDesc::Var* matmul_0_x = matmul_0->add_inputs(); - matmul_0_x->set_parameter("X"); - matmul_0_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_0_y = matmul_0->add_inputs(); - matmul_0_y->set_parameter("Y"); - matmul_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* matmul_0_out = matmul_0->add_outputs(); - matmul_0_out->set_parameter("Out"); - matmul_0_out->add_arguments()->assign("matmul_out_0"); - proto::OpDesc* matmul_1 = pattern->add_ops(); - matmul_1->set_type("matmul"); - proto::OpDesc::Var* matmul_1_x = matmul_1->add_inputs(); - matmul_1_x->set_parameter("X"); - matmul_1_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_1_y = matmul_1->add_inputs(); - matmul_1_y->set_parameter("Y"); - matmul_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* matmul_1_out = matmul_1->add_outputs(); - matmul_1_out->set_parameter("Out"); - matmul_1_out->add_arguments()->assign("matmul_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* concat = replace->add_ops(); - concat->set_type("concat"); - proto::OpDesc::Var* concat_x = concat->add_inputs(); - concat_x->set_parameter("X"); - concat_x->add_arguments()->assign("b"); - concat_x->add_arguments()->assign("c"); - proto::OpDesc::Var* concat_out = concat->add_outputs(); - concat_out->set_parameter("Out"); - concat_out->add_arguments()->assign("concat_out"); - proto::OpDesc* matmul = replace->add_ops(); - matmul->set_type("matmul"); - proto::OpDesc::Var* matmul_x = matmul->add_inputs(); - matmul_x->set_parameter("X"); - matmul_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_y = matmul->add_inputs(); - matmul_y->set_parameter("Y"); - matmul_y->add_arguments()->assign("concat_out"); - proto::OpDesc::Var* matmul_out = matmul->add_outputs(); - matmul_out->set_parameter("Out"); - matmul_out->add_arguments()->assign("matmul_out"); - proto::OpDesc* slice_0 = replace->add_ops(); - slice_0->set_type("slice"); - proto::OpDesc::Var* slice_0_x = slice_0->add_inputs(); - slice_0_x->set_parameter("X"); - slice_0_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_0_out = slice_0->add_outputs(); - slice_0_out->set_parameter("Out"); - slice_0_out->add_arguments()->assign("slice_out_0"); - proto::OpDesc* slice_1 = replace->add_ops(); - slice_1->set_type("slice"); - proto::OpDesc::Var* slice_1_x = slice_1->add_inputs(); - slice_1_x->set_parameter("X"); - slice_1_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_1_out = slice_1->add_outputs(); - slice_1_out->set_parameter("Out"); - 
slice_1_out->add_arguments()->assign("slice_out_1"); - for (const char* var : {"a", "b", "c", "matmul_out_0", "matmul_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("slice_out_0"); - pass_desc->mutable_var_maps(4)->set_replace_var("slice_out_1"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_combine_matmul) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto matmul1 = OP_(matmul)({{"X", x}, {"Y", y}}).Out("Out"); + auto matmul2 = OP_(matmul)({{"X", x}, {"Y", z}}).Out("Out"); + return std::make_tuple(matmul1, matmul2); + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + auto concat = OP_(concat)({"X", {y, z}}).Out("Out"); + auto matmul = OP_(matmul)({{"X", x}, {"Y", concat}}).Out("Out"); + auto slice1 = OP_(slice)({"X", matmul}).Out("Out"); + auto slice2 = OP_(slice)({"X", matmul}).Out("Out"); + return std::make_tuple(slice1, slice2); + }; + return {pattern, replace}; } -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_GENERATE_PASS(generate_fc_fuse, - paddle::framework::ir::generate_fc_fuse); -REGISTER_GENERATE_PASS(generate_multi_add_to_addn, - paddle::framework::ir::generate_multi_add_to_addn); -REGISTER_GENERATE_PASS(generate_combine_matmul, - paddle::framework::ir::generate_combine_matmul); - namespace paddle { namespace framework { namespace ir { TEST(GeneratePass, construct_with_string) { std::string binary_str; - generate_fc_fuse().SerializeToString(&binary_str); + register_generate_fc_fuse().MultiPassDesc().SerializeToString(&binary_str); GeneratePass generate_pass(binary_str); } @@ -318,7 +155,7 @@ TEST(GeneratePass, generate_multi_add_to_addn) { graph.reset(pass->Apply(graph.release())); int num_nodes_after = graph->Nodes().size(); - int num_addn_nodes_after = GetNumOpNodes(graph, "add_n"); + int num_addn_nodes_after = GetNumOpNodes(graph, "sum"); VLOG(3) << DebugString(graph); PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 2, From 623df4293f1c7e08386f8786d8e6338c043fde25 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Sat, 9 Oct 2021 12:00:35 +0800 Subject: [PATCH 31/80] support ClipGradByGlobalNorm in sharding (#36012) * support ClipGradByGlobalNorm in sharding * support ClipGradByGlobalNorm in sharding * test=allcase --- .../dygraph_optimizer/__init__.py | 1 + .../hybrid_parallel_optimizer.py | 16 ++++++++++++++-- .../hybrid_parallel_sharding_model.py | 19 ++++++++++++------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py index f0f26bd2e0d06..28260d7aa1863 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and from .hybrid_parallel_optimizer import HybridParallelOptimizer from .hybrid_parallel_gradscaler import HybridParallelGradScaler +from .dygraph_sharding_optimizer import DygraphShardingOptimizer __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py 
b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index b00ef2cdcb0e1..76e326ce20d7c 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -88,6 +88,13 @@ def _dygraph_clip(self, params_grads): paddle.distributed.all_reduce( global_norm_var_dist, group=self._hcg.get_check_parallel_group()) + # In Sharding mode, param and grad is mapping different rank in optimizer. + # ClipGradByGlobalNorm need allreduce to get globol norm + if self._hcg.get_sharding_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_not_dist, + group=self._hcg.get_sharding_parallel_group()) + global_norm_var = layers.sqrt(global_norm_var_dist + global_norm_var_not_dist) @@ -139,8 +146,13 @@ def __init__(self, optimizer, hcg, strategy): logger.warning("using ClipGradByGlobalNorm in TensorParallel, the origin " \ "optmizer'grad clip will be changed.") - self._inner_opt._grad_clip = HybridParallelClipGrad( - self._inner_opt._grad_clip, hcg) + if self._sharding_enable: + # change sharding inner_optimizer's _grad_clip + self._inner_opt._inner_optimizer._grad_clip = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) + else: + self._inner_opt._grad_clip = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) @imperative_base.no_grad @framework.dygraph_only diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py index 2995e4dbf8401..8cb1166cd0d83 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py @@ -183,21 +183,23 @@ def build_optimizer(self, strategy=None, is_sharding=True, Optimizer="adam"): - + clip = paddle.nn.ClipGradByGlobalNorm(0.5) if Optimizer == "adam": if is_sharding: optimizer = DygraphShardingOptimizer( hcg=fleet.get_hybrid_communicate_group(), user_defined_strategy=strategy, params=model.parameters(), - inner_optimizer_class=paddle.optimizer.Adam, + inner_optimizer_class=paddle.optimizer.AdamW, learning_rate=0.001, - weight_decay=0.00001, ) + weight_decay=0.00001, + grad_clip=clip) else: - optimizer = paddle.optimizer.Adam( + optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=0.001, - weight_decay=0.00001, ) + weight_decay=0.00001, + grad_clip=clip) else: if is_sharding: optimizer = DygraphShardingOptimizer( @@ -205,10 +207,13 @@ def build_optimizer(self, user_defined_strategy=strategy, params=model.parameters(), inner_optimizer_class=paddle.optimizer.Momentum, - learning_rate=0.001, ) + learning_rate=0.001, + grad_clip=clip) else: optimizer = paddle.optimizer.Momentum( - learning_rate=0.001, parameters=model.parameters()) + learning_rate=0.001, + parameters=model.parameters(), + grad_clip=clip) return optimizer def build_model_optimizer(self, Optimizer="adam"): From c8a01010e84bf8566a417060f50a43e100a10172 Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Sat, 9 Oct 2021 16:21:39 +0800 Subject: [PATCH 32/80] update fft api path (#36219) * update fft api path * add sample code for ihfft2 Co-authored-by: chenfeiyu --- python/paddle/__init__.py | 2 +- python/paddle/fft.py | 61 +++++++++++++++++++++++++++++++++++++ python/paddle/tensor/fft.py | 44 ++++++++++++-------------- 3 files changed, 81 
insertions(+), 26 deletions(-) create mode 100644 python/paddle/fft.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ad8640f6f5584..decffa66f4174 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -64,7 +64,6 @@ import paddle.static # noqa: F401 import paddle.vision # noqa: F401 -from .tensor import fft from .tensor.random import bernoulli # noqa: F401 from .tensor.attribute import rank # noqa: F401 @@ -294,6 +293,7 @@ from .hapi import flops # noqa: F401 from . import hub # noqa: F401 from . import linalg # noqa: F401 +from . import fft # noqa: F401 import paddle.text # noqa: F401 import paddle.vision # noqa: F401 diff --git a/python/paddle/fft.py b/python/paddle/fft.py new file mode 100644 index 0000000000000..3ac02c9c8dc18 --- /dev/null +++ b/python/paddle/fft.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tensor.fft import fft # noqa: F401 +from .tensor.fft import fft2 # noqa: F401 +from .tensor.fft import fftn # noqa: F401 +from .tensor.fft import ifft # noqa: F401 +from .tensor.fft import ifft2 # noqa: F401 +from .tensor.fft import ifftn # noqa: F401 +from .tensor.fft import rfft # noqa: F401 +from .tensor.fft import rfft2 # noqa: F401 +from .tensor.fft import rfftn # noqa: F401 +from .tensor.fft import irfft # noqa: F401 +from .tensor.fft import irfft2 # noqa: F401 +from .tensor.fft import irfftn # noqa: F401 +from .tensor.fft import hfft # noqa: F401 +from .tensor.fft import hfft2 # noqa: F401 +from .tensor.fft import hfftn # noqa: F401 +from .tensor.fft import ihfft # noqa: F401 +from .tensor.fft import ihfft2 # noqa: F401 +from .tensor.fft import ihfftn # noqa: F401 +from .tensor.fft import fftfreq # noqa: F401 +from .tensor.fft import rfftfreq # noqa: F401 +from .tensor.fft import fftshift # noqa: F401 +from .tensor.fft import ifftshift # noqa: F401 + +__all__ = [ # noqa + 'fft', + 'fft2', + 'fftn', + 'ifft', + 'ifft2', + 'ifftn', + 'rfft', + 'rfft2', + 'rfftn', + 'irfft', + 'irfft2', + 'irfftn', + 'hfft', + 'hfft2', + 'hfftn', + 'ihfft', + 'ihfft2', + 'ihfftn', + 'fftfreq', + 'rfftfreq', + 'fftshift', + 'ifftshift' +] diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py index 98ca858c0eb85..829399d14eaa0 100644 --- a/python/paddle/tensor/fft.py +++ b/python/paddle/tensor/fft.py @@ -21,30 +21,7 @@ from ..fluid.data_feeder import check_variable_and_dtype from ..fluid.layer_helper import LayerHelper -__all__ = [ - 'fft', - 'fft2', - 'fftn', - 'ifft', - 'ifft2', - 'ifftn', - 'rfft', - 'rfft2', - 'rfftn', - 'irfft', - 'irfft2', - 'irfftn', - 'hfft', - 'hfft2', - 'hfftn', - 'ihfft', - 'ihfft2', - 'ihfftn', - 'fftfreq', - 'rfftfreq', - 'fftshift', - 'ifftshift', -] +__all__ = [] def _check_normalization(norm): @@ -1135,7 +1112,24 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): refer to :ref:`api_guide_Name` . Returns: - out(Tensor) : The result of the inverse real 2-D FFT. 
+ out(Tensor) : The result of the inverse hermitian 2-D FFT. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:5, :5][0].astype(np.float64) + xp = paddle.to_tensor(x) + ihfft2_xp = paddle.fft.ihfft2(xp).numpy() + print(ihfft2_xp) + # [[ 2. +0.j 0. +0.j 0. +0.j ] + # [-0.5-0.68819096j 0. +0.j 0. +0.j ] + # [-0.5-0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.68819096j 0. +0.j 0. +0.j ]] """ _check_at_least_ndim(x, 2) if s is not None: From 62e411508f31814e9b9f71f78769d3ce2101e35b Mon Sep 17 00:00:00 2001 From: zhiboniu <31800336+zhiboniu@users.noreply.github.com> Date: Sat, 9 Oct 2021 16:35:17 +0800 Subject: [PATCH 33/80] fill_diagonal op fix border cross caused by offset (#36212) --- paddle/fluid/operators/fill_diagonal_op.cc | 18 ++++++++--- paddle/fluid/operators/fill_diagonal_op.cu | 16 +++++++--- .../unittests/test_tensor_fill_diagonal_.py | 30 +++++++++++++++++++ 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/fill_diagonal_op.cc b/paddle/fluid/operators/fill_diagonal_op.cc index db55c3e99693a..be3239d504844 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cc +++ b/paddle/fluid/operators/fill_diagonal_op.cc @@ -108,8 +108,15 @@ class FillIDiagonalKernel : public framework::OpKernel { size = std::min(size, out_dims[1] * out_dims[1]); } - for (int64_t i = offset; i < size; i += strides) { - out_data[i] = temp_var; + for (int64_t i = 0; i < size; i += strides) { + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. + // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if (i % out_dims[1] + offset >= 0 && + i % out_dims[1] + offset < out_dims[1]) { + out_data[i + offset] = temp_var; + } } } }; @@ -176,8 +183,11 @@ class FillIDiagonalGradKernel : public framework::OpKernel { wrapsize = size; } - for (int64_t i = offset; i < wrapsize; i += strides) { - data[i] = T(0); + for (int64_t i = 0; i < wrapsize; i += strides) { + if (i % dx_dims[1] + offset >= 0 && + i % dx_dims[1] + offset < dx_dims[1]) { + data[i + offset] = T(0); + } } } } diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu index 5047059fb364d..15eabd4216d0b 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cu +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -22,11 +22,19 @@ using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_constant_kernel(const int64_t featuresize, T* in_data, - int64_t strides, int offset, T fillvar) { + int64_t strides, int offset, T fillvar, + int dims) { for (int64_t idx = blockIdx.x * featuresize + threadIdx.x; idx * strides + offset < (blockIdx.x + 1) * featuresize; idx += blockDim.x) { - in_data[idx * strides + offset] = fillvar; + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. 
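+ // For example, filling a 3x3 tensor with offset=2 only writes element
+ // (0, 2); the candidate positions in rows 1 and 2 would land past the
+ // end of their rows and are skipped.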
+ // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if ((idx * strides) % dims + offset < dims && + (idx * strides) % dims + offset >= 0) { + in_data[idx * strides + offset] = fillvar; + } } } @@ -62,7 +70,7 @@ class FillIDiagonalCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size / strides), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(size, out_data, strides, - offset, temp_var); + offset, temp_var, out_dims[1]); } }; @@ -96,7 +104,7 @@ class FillIDiagonalGradCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(wrapsize, in_data, strides, - offset, T(0)); + offset, T(0), out_dims[1]); } }; diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py index 41a8a9750cb64..3beb6a537eca0 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py @@ -50,6 +50,36 @@ def test_dim2_normal(self): (y.grad.numpy().astype('float32') == expected_grad).all(), True) + def test_offset(self): + expected_np = np.array( + [[2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32') + expected_grad = np.array( + [[1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32') + + typelist = ['float32', 'float64', 'int32', 'int64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((3, 3), dtype=dtype) + x.stop_gradient = False + y = x * 2 + y.fill_diagonal_(1, offset=2, wrap=True) + loss = y.sum() + loss.backward() + + self.assertEqual( + (y.numpy().astype('float32') == expected_np).all(), True) + self.assertEqual( + (y.grad.numpy().astype('float32') == expected_grad).all(), + True) + def test_bool(self): expected_np = np.array( [[False, True, True], [True, False, True], [True, True, False]]) From 21dc7f40e14a09528711054e8bc329e3d9b15ee2 Mon Sep 17 00:00:00 2001 From: From00 Date: Sat, 9 Oct 2021 19:06:18 +0800 Subject: [PATCH 34/80] Add new API 'tensordot' (#36273) * Add new API tensordot * Set timeout value 400 for UT; Fix format for EN docs * Set timeout value 1000 for UT; Fix format for EN docs * Remove some input check * Coding style improve: don't compare boolean values to True or False using == --- python/paddle/__init__.py | 2 + .../fluid/tests/unittests/CMakeLists.txt | 1 + .../fluid/tests/unittests/test_tensordot.py | 238 ++++++++++++++++++ python/paddle/tensor/__init__.py | 2 + python/paddle/tensor/manipulation.py | 208 +++++++++++++++ 5 files changed, 451 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_tensordot.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index decffa66f4174..2051a4f6fcd50 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -151,6 +151,7 @@ from .tensor.manipulation import roll # noqa: F401 from .tensor.manipulation import chunk # noqa: F401 from .tensor.manipulation import tolist # noqa: F401 +from .tensor.manipulation import tensordot # noqa: F401 from .tensor.math import abs # noqa: F401 from .tensor.math import acos # noqa: F401 from .tensor.math import asin # noqa: F401 @@ -470,6 +471,7 @@ 'bmm', 'chunk', 'tolist', + 'tensordot', 'greater_than', 
'shard_index', 'argsort', diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cd1c4363879bb..61a43aeb44e84 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1042,3 +1042,4 @@ if(WITH_GPU OR WITH_ROCM) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) +set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py new file mode 100644 index 0000000000000..29f3308988f6d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -0,0 +1,238 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +import paddle.fluid.core as core +import numpy as np +import itertools as it + +np.set_printoptions(threshold=np.inf) + + +def tensordot_np(x, y, axes): + if isinstance(axes, paddle.fluid.framework.Variable): + axes = axes.tolist() + + # np.tensordot does not support empty axes + if not axes: + axes = 0 + if (isinstance(axes, (tuple, list))): + if all(np.issubdtype(type(i), np.integer) for i in axes): + axes = [axes, axes] + else: + axes_x = axes[0] + if len(axes) > 1: + axes_y = axes[1] + else: + axes_y = axes_x + len_axes_x, len_axes_y = len(axes_x), len(axes_y) + if len_axes_x < len_axes_y: + axes_x = axes_x + axes_y[len_axes_x:] + elif len_axes_y < len_axes_x: + axes_y = axes_y + axes_x[len_axes_y:] + axes = [axes_x, axes_y] + + # np.tensordot does not support broadcast + if (isinstance(axes, (tuple, list))): + axes_x, axes_y = axes + else: + axes_x = list(range(x.ndim - axes, x.ndim)) + axes_y = list(range(axes)) + shape_x, shape_y = list(np.shape(x)), list(np.shape(y)) + for i in range(len(axes_x)): + dim_x, dim_y = axes_x[i], axes_y[i] + sx, sy = shape_x[dim_x], shape_y[dim_y] + if sx == 1: + shape_y[dim_y] = 1 + y = np.sum(y, dim_y) + y = np.reshape(y, shape_y) + elif sy == 1: + shape_x[dim_x] = 1 + x = np.sum(x, dim_x) + x = np.reshape(x, shape_x) + + return np.tensordot(x, y, axes) + + +class TestTensordotAPI(unittest.TestCase): + def setUp(self): + self.set_dtype() + self.set_input_shape() + self.set_input_data() + + def set_dtype(self): + self.dtype = np.float32 + + def set_input_shape(self): + self.x_shape = [5, 5, 5, 5] + self.y_shape = [5, 5, 5, 5] + + def set_input_data(self): + self.x = np.random.random(self.x_shape).astype(self.dtype) + self.y = np.random.random(self.y_shape).astype(self.dtype) + self.all_axes = [2] + + def run_dygraph(self, place): + paddle.disable_static() + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, self.axes) + np_res = tensordot_np(self.x, self.y, self.axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + + 
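+ # run_static builds the same tensordot computation as a static-graph
+ # program, executes it with a static Executor, and checks the result
+ # against the numpy reference implementation above.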
def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x_shape, dtype=self.dtype) + y = paddle.static.data( + name='y', shape=self.y_shape, dtype=self.dtype) + z = paddle.tensordot(x, y, self.axes) + exe = paddle.static.Executor(place) + paddle_res = exe.run(feed={'x': self.x, + 'y': self.y}, + fetch_list=[z]) + np_res = tensordot_np(self.x, self.y, self.axes) + np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) + + def test_cases(self): + self.all_axes = [] + axial_index = range(4) + all_permutations = list(it.permutations(axial_index, 0)) + list( + it.permutations(axial_index, 1)) + list( + it.permutations(axial_index, 2)) + list( + it.permutations(axial_index, 3)) + list( + it.permutations(axial_index, 4)) + self.all_axes.extend(list(i) for i in all_permutations) + + for axes_x in all_permutations: + for axes_y in all_permutations: + if len(axes_x) < len(axes_y): + supplementary_axes_x = axes_x + axes_y[len(axes_x):] + if any( + supplementary_axes_x.count(i) > 1 + for i in supplementary_axes_x): + continue + elif len(axes_y) < len(axes_x): + supplementary_axes_y = axes_y + axes_x[len(axes_y):] + if any( + supplementary_axes_y.count(i) > 1 + for i in supplementary_axes_y): + continue + self.all_axes.append([list(axes_x), list(axes_y)]) + + self.all_axes.extend(range(5)) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + self.run_static(place) + + +class TestTensordotAPIFloat64(TestTensordotAPI): + def set_dtype(self): + self.dtype = np.float64 + + +class TestTensordotAPIAxesType(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [3, 4, 4] + self.y_shape = [4, 4, 5] + + def test_cases(self): + self.all_axes = [ + 0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), ( + (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]), + [[1, 2], [0, 1]] + ] + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + self.run_static(place) + + # The 'axes' with type 'Tensor' in tensordot is not available in static mode + paddle.disable_static() + for place in places: + self.all_axes = [ + paddle.to_tensor([1]), (paddle.to_tensor([1])), + (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), + [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], + paddle.to_tensor([[1, 2], [0, 1]]) + ] + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + + def test_error(self): + self.all_axes = [[[[0], [1]]], 0.1, -1, 100, [[1, 2], [0, 0]], + [[1, 2], [0, -1]], [0, 1, 2, 3]] + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = paddle.to_tensor(self.y) + for axes in self.all_axes: + with self.assertRaises(BaseException): + paddle.tensordot(x, y, axes) + + +class TestTensordotAPIAxesTypeFloat64(TestTensordotAPIAxesType): + def set_dtype(self): + self.dtype = np.float64 + + +class TestTensordotAPIBroadcastCase1(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 1, 1, 5] + self.y_shape = [1, 5, 1, 1] + + +class TestTensordotAPIBroadcastCase2(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 5, 5, 5] + self.y_shape = [1, 1, 1, 5] + + +class 
TestTensordotAPIBroadcastCase3(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [5, 5, 1, 5] + + +class TestTensordotAPIBroadcastCase4(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [1, 1, 1, 1] + + +class TestTensordotAPIBroadcastCase5(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 1, 5, 5] + self.y_shape = [5, 5, 1, 5] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index b5d79b6039320..c8f897c21648f 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -105,6 +105,7 @@ from .manipulation import unbind # noqa: F401 from .manipulation import roll # noqa: F401 from .manipulation import chunk # noqa: F401 +from .manipulation import tensordot # noqa: F401 from .math import abs # noqa: F401 from .math import acos # noqa: F401 from .math import asin # noqa: F401 @@ -346,6 +347,7 @@ 'slice', 'split', 'chunk', + 'tensordot', 'squeeze', 'squeeze_', 'stack', diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 4129a1060daf9..5f7588cb2a9a0 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2173,3 +2173,211 @@ def strided_slice(x, axes, starts, ends, strides, name=None): return paddle.fluid.layers.strided_slice( input=x, axes=axes, starts=starts, ends=ends, strides=strides) + + +def tensordot(x, y, axes=2, name=None): + r""" + This function computes a contraction, which sum the product of elements from two tensors along the given axes. + + Args: + x (Tensor): The left tensor for contraction with data type ``float32`` or ``float64``. + y (Tensor): The right tensor for contraction with the same data type as ``x``. + axes (int|tuple|list|Tensor, optional): The axes to contract for ``x`` and ``y``, defaulted to integer ``2``. + + 1. It could be a non-negative integer ``n``, + in which the function will sum over the last ``n`` axes of ``x`` and the first ``n`` axes of ``y`` in order. + + 2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. + For example, ``axes`` =[0, 1] applies contraction along the first two axes for ``x`` and the first two axes for ``y``. + + 3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. + When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. + When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. + When containing more than two tuple|list|Tensor, only the first two axis sequences will be used while the others will be ignored. + + 4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list + and applied the same rules described above to determine the contraction axes. + Note that the ``axes`` with Tensor type is ONLY available in Dygraph mode. + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Return: + Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. + In general, :math:`output.ndim = x.ndim + y.ndim - 2 \times n_{axes}`, where :math:`n_{axes}` denotes the number of axes to be contracted. + + NOTES: + 1. 
This function supports tensor broadcast, + the size in the corresponding dimensions of ``x`` and ``y`` should be equal, or applies to the broadcast rules. + 2. This function also supports axes expansion, + when the two given axis sequences for ``x`` and ``y`` are of different lengths, + the shorter sequence will expand the same axes as the longer one at the end. + For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], + the axis sequence for ``x`` is [0, 1, 2, 3], + while the corresponding axis sequences for ``y`` will be expanded from [1, 0] to [1, 0, 2, 3]. + + Examples: + .. code-block:: python + + import paddle + + data_type = 'float64' + + # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product. + # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. + x = paddle.arange(4, dtype=data_type).reshape([2, 2]) + y = paddle.arange(4, dtype=data_type).reshape([2, 2]) + z = paddle.tensordot(x, y, axes=0) + # z = [[[[0., 0.], + # [0., 0.]], + # + # [[0., 1.], + # [2., 3.]]], + # + # + # [[[0., 2.], + # [4., 6.]], + # + # [[0., 3.], + # [6., 9.]]]] + + + # For two 1-d tensor x and y, the case axes=1 is equivalent to inner product. + x = paddle.arange(10, dtype=data_type) + y = paddle.arange(10, dtype=data_type) + z1 = paddle.tensordot(x, y, axes=1) + z2 = paddle.dot(x, y) + # z1 = z2 = [285.] + + + # For two 2-d tensor x and y, the case axes=1 is equivalent to matrix multiplication. + x = paddle.arange(6, dtype=data_type).reshape([2, 3]) + y = paddle.arange(12, dtype=data_type).reshape([3, 4]) + z1 = paddle.tensordot(x, y, axes=1) + z2 = paddle.matmul(x, y) + # z1 = z2 = [[20., 23., 26., 29.], + # [56., 68., 80., 92.]] + + + # When axes is a 1-d int list, x and y will be contracted along the same given axes. + # Note that axes=[1, 2] is equivalent to axes=[[1, 2]], axes=[[1, 2], []], axes=[[1, 2], [1]], and axes=[[1, 2], [1, 2]]. + x = paddle.arange(24, dtype=data_type).reshape([2, 3, 4]) + y = paddle.arange(36, dtype=data_type).reshape([3, 3, 4]) + z = paddle.tensordot(x, y, axes=[1, 2]) + # z = [[506. , 1298., 2090.], + # [1298., 3818., 6338.]] + + + # When axes is a list containing two 1-d int list, the first will be applied to x and the second to y. + x = paddle.arange(60, dtype=data_type).reshape([3, 4, 5]) + y = paddle.arange(24, dtype=data_type).reshape([4, 3, 2]) + z = paddle.tensordot(x, y, axes=([1, 0], [0, 1])) + # z = [[4400., 4730.], + # [4532., 4874.], + # [4664., 5018.], + # [4796., 5162.], + # [4928., 5306.]] + + + # Thanks to the support of axes expansion, axes=[[0, 1, 3, 4], [1, 0, 3, 4]] can be abbreviated as axes= [[0, 1, 3, 4], [1, 0]]. 
+ x = paddle.arange(720, dtype=data_type).reshape([2, 3, 4, 5, 6]) + y = paddle.arange(720, dtype=data_type).reshape([3, 2, 4, 5, 6]) + z = paddle.tensordot(x, y, axes=[[0, 1, 3, 4], [1, 0]]) + # z = [[23217330., 24915630., 26613930., 28312230.], + # [24915630., 26775930., 28636230., 30496530.], + # [26613930., 28636230., 30658530., 32680830.], + # [28312230., 30496530., 32680830., 34865130.]] + """ + op_type = 'tensordot' + input_dtype = ['float32', 'float64'] + + check_variable_and_dtype(x, 'x', input_dtype, op_type) + check_variable_and_dtype(y, 'y', input_dtype, op_type) + check_type(axes, 'axes', (int, tuple, list, Variable), op_type) + + def _var_to_list(var): + if in_dygraph_mode(): + return tolist(var) + raise TypeError( + "The 'axes' with type 'Tensor' in " + op_type + + " is not available in static graph mode, " + "please convert its type to int|Tuple|List, or use dynamic graph mode." + ) + + axes_x = [] + axes_y = [] + if np.issubdtype(type(axes), np.integer): + assert axes >= 0, ( + "The 'axes' in " + op_type + + f" should not be negative, but received axes={axes}.") + axes_x = range(x.ndim - axes, x.ndim) + axes_y = range(axes) + else: + if isinstance(axes, Variable): + axes = _var_to_list(axes) + + if not axes or np.issubdtype(type(axes[0]), np.integer): + axes_x = axes + else: + axes_x = axes[0] + if len(axes) > 1: + axes_y = axes[1] + + if isinstance(axes_x, Variable): + axes_x = _var_to_list(axes_x) + if isinstance(axes_y, Variable): + axes_y = _var_to_list(axes_y) + + axes_x, axes_y = list(axes_x), list(axes_y) + len_axes_x, len_axes_y = len(axes_x), len(axes_y) + if len_axes_x < len_axes_y: + axes_x.extend(axes_y[len_axes_x:]) + elif len_axes_y < len_axes_x: + axes_y.extend(axes_x[len_axes_y:]) + + shape_x, shape_y = list(x.shape), list(y.shape) + need_contracted_dim_x = np.zeros((x.ndim), dtype=bool) + need_contracted_dim_y = np.zeros((y.ndim), dtype=bool) + contraction_size = 1 + for i in range(len(axes_x)): + dim_x, dim_y = axes_x[i], axes_y[i] + sx, sy = shape_x[dim_x], shape_y[dim_y] + if sx == 1: + shape_y[dim_y] = 1 + y = y.sum(dim_y).reshape(shape_y) + elif sy == 1: + shape_x[dim_x] = 1 + x = x.sum(dim_x).reshape(shape_x) + else: + assert sx == sy, "The dimensional size for 'x' and 'y' in " + op_type + f" should match each other, but 'x' has size {sx} in dim {dim_x} while 'y' has size {sy} in dim {dim_y}." 
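+ # Record which axes take part in the contraction and accumulate the
+ # contraction size; the tensors are later transposed so these axes are
+ # grouped together and reshaped to 2-D, reducing the contraction to a
+ # single matmul.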
+ + need_contracted_dim_x[dim_x] = True + need_contracted_dim_y[dim_y] = True + contraction_size *= shape_x[dim_x] + + perm_x = [] + perm_y = [] + shape_out = [] + not_contraction_size_x = 1 + not_contraction_size_y = 1 + for i in range(x.ndim): + if not need_contracted_dim_x[i]: + perm_x.append(i) + shape_out.append(shape_x[i]) + not_contraction_size_x *= shape_x[i] + perm_x.extend(axes_x) + perm_y.extend(axes_y) + for i in range(y.ndim): + if not need_contracted_dim_y[i]: + perm_y.append(i) + shape_out.append(shape_y[i]) + not_contraction_size_y *= shape_y[i] + + if not shape_out: + shape_out = [1] + + x = x.transpose(perm=perm_x).reshape( + [not_contraction_size_x, contraction_size]) + y = y.transpose(perm=perm_y).reshape( + [contraction_size, not_contraction_size_y]) + out = x.matmul(y).reshape(shape_out) + return out From cb620ca6de8909eed0ed14620dbb0c60628def86 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Sat, 9 Oct 2021 19:09:40 +0800 Subject: [PATCH 35/80] Add const for OpDesc::id() and VarDesc::id() (#36298) * add const OpDesc id() * add const for VarDesc::id() --- paddle/fluid/framework/op_desc.h | 2 +- paddle/fluid/framework/var_desc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 0eafbb027f042..9470fd9b69933 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -164,7 +164,7 @@ class OpDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. - uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: template diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index d1a1757d5309b..a6f56ad445834 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -160,7 +160,7 @@ class VarDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. - uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: const proto::VarType::TensorDesc &tensor_desc() const; From 91119271584dbf6cefe86a170e078d245bf912e5 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Sat, 9 Oct 2021 19:20:51 +0800 Subject: [PATCH 36/80] Enhance OpTest for bfloat16. (#36079) --- paddle/fluid/operators/cast_op.cu | 33 +++---- .../paddle/fluid/tests/unittests/op_test.py | 86 +++++++++++++------ .../fluid/tests/unittests/test_cast_op.py | 38 +++++++- 3 files changed, 106 insertions(+), 51 deletions(-) diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 601735c2f148a..05a110fe65b83 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -94,24 +94,19 @@ class CastCUDAOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; - -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel>, - ops::CastCUDAOpKernel>); +namespace plat = paddle::platform; + +#define REGISTER_CAST_CUDA_BASE(op_name, ...) 
\ + REGISTER_OP_CUDA_KERNEL( \ + op_name, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel>, \ + ops::CastCUDAOpKernel>, ##__VA_ARGS__); + +#if !defined(PADDLE_WITH_HIP) +REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel) #else -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel, - ops::CastCUDAOpKernel>, - ops::CastCUDAOpKernel>); +REGISTER_CAST_CUDA_BASE(cast) #endif diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 3621d20fa2472..41fd0b442fe1c 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -147,6 +147,9 @@ def get_output(): op.run(scope, place) for output_name in output_names: output_numpy = np.array(scope.find_var(output_name).get_tensor()) + # numpy.dtype does not have bfloat16, thus we use numpy.uint16 to + # store bfloat16 data, and need to be converted to float to check + # the floating precision. if tensor_to_check._dtype() == core.VarDesc.VarType.BF16: output_numpy = convert_uint16_to_float(output_numpy) sum.append(output_numpy.astype(tensor_to_check_dtype).mean()) @@ -362,11 +365,26 @@ def try_call_once(self, data_type): self.dtype = data_type def is_bfloat16_op(self): + # self.dtype is the dtype of inputs, and is set in infer_dtype_from_inputs_outputs. + # Make sure this function is called after calling infer_dtype_from_inputs_outputs. return self.dtype == np.uint16 or ( - hasattr(self, 'mkldnn_data_type') and - getattr(self, 'mkldnn_data_type') is "bfloat16") or ( - hasattr(self, 'attrs') and 'mkldnn_data_type' in self.attrs and - self.attrs['mkldnn_data_type'] == 'bfloat16') + hasattr(self, 'output_dtype') and + self.output_dtype == np.uint16) or ( + hasattr(self, 'mkldnn_data_type') and + getattr(self, 'mkldnn_data_type') is "bfloat16") or ( + hasattr(self, 'attrs') and + 'mkldnn_data_type' in self.attrs and + self.attrs['mkldnn_data_type'] == 'bfloat16') + + def is_mkldnn_op(self): + return (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or ( + hasattr(self, "attrs") and "use_mkldnn" in self.attrs and + self.attrs["use_mkldnn"] == True) + + def is_xpu_op(self): + return (hasattr(self, "use_xpu") and self.use_xpu == True) or ( + hasattr(self, "attrs") and "use_xpu" in self.attrs and + self.attrs["use_xpu"] == True) def infer_dtype_from_inputs_outputs(self, inputs, outputs): def is_np_data(input): @@ -398,8 +416,8 @@ def infer_dtype(numpy_dict, dtype_set): # infer dtype from inputs, and dtype means the precision of the test # collect dtype of all inputs - dtype_set = set() - infer_dtype(inputs, dtype_set) + input_dtype_set = set() + infer_dtype(inputs, input_dtype_set) dtype_list = [ np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16), np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16), @@ -408,12 +426,20 @@ def infer_dtype(numpy_dict, dtype_set): ] # check the dtype in dtype_list in order, select the first dtype that in dtype_set for dtype in dtype_list: - if dtype in dtype_set: + if dtype in input_dtype_set: self.dtype = dtype break - # save dtype in class attr + # save input dtype in class attr self.__class__.dtype = self.dtype + # infer dtype 
of outputs + output_dtype_set = set() + infer_dtype(outputs, output_dtype_set) + for dtype in dtype_list: + if dtype in output_dtype_set: + self.output_dtype = dtype + break + def feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -439,14 +465,10 @@ def feed_var(self, input_vars, place): def _append_ops(self, block): self.__class__.op_type = self.op_type # for ci check, please not delete it for now - if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \ - (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \ - self.attrs["use_mkldnn"] == True): + if self.is_mkldnn_op(): self.__class__.use_mkldnn = True - if (hasattr(self, "use_xpu") and self.use_xpu == True) or \ - (hasattr(self, "attrs") and "use_xpu" in self.attrs and \ - self.attrs["use_xpu"] == True): + if self.is_xpu_op(): self.__class__.use_xpu = True op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) @@ -1092,12 +1114,15 @@ def check_output_with_place(self, atol = 0 if self.is_bfloat16_op(): - check_dygraph = False - if hasattr(self, 'force_fp32_output') and getattr( - self, 'force_fp32_output'): - atol = 1e-2 + if self.is_mkldnn_op(): + check_dygraph = False + if hasattr(self, 'force_fp32_output') and getattr( + self, 'force_fp32_output'): + atol = 1e-2 + else: + atol = 2 else: - atol = 2 + atol = 1e-2 if no_check_set is not None: if self.op_type not in no_check_set_white_list.no_check_set_white_list: @@ -1193,6 +1218,7 @@ def find_actual(target_name, fetch_list): expect = self.outputs[out_name] expect_t = expect[0] if isinstance(expect, tuple) else expect + # np.uint16 represents bfloat16 if actual_t.dtype == np.uint16 and expect_t.dtype in [ np.float32, np.float64 ]: @@ -1205,6 +1231,7 @@ def find_actual(target_name, fetch_list): expect_t = convert_uint16_to_float(expect_t) actual_t = convert_uint16_to_float(actual_t) atol = max(atol, 0.03) + # NOTE(zhiqiu): np.allclose([], [1.]) returns True # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng if expect_t.size == 0: @@ -1214,13 +1241,19 @@ def find_actual(target_name, fetch_list): np.allclose( actual_t, expect_t, - rtol=rtol, atol=atol, + rtol=rtol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) if check_dygraph: + if self.is_bfloat16_op(): + if imperative_actual_t.dtype == np.uint16: + imperative_actual_t = convert_uint16_to_float( + imperative_actual_t) + if expect_t.dtype == np.uint16: + expect_t = convert_uint16_to_float(expect_t) if six.moves.reduce( lambda x, y: x * y, imperative_actual_t.shape, 1) == 0 and six.moves.reduce( @@ -1232,6 +1265,7 @@ def find_actual(target_name, fetch_list): imperative_actual_t, expect_t, atol=atol, + rtol=rtol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + @@ -1340,14 +1374,10 @@ def check_output(self, check_dygraph=True, inplace_atol=None): self.__class__.op_type = self.op_type - if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \ - (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \ - self.attrs["use_mkldnn"] == True): + if self.is_mkldnn_op(): self.__class__.use_mkldnn = True - if (hasattr(self, "use_xpu") and self.use_xpu == True) or \ - (hasattr(self, "attrs") and "use_xpu" in self.attrs and \ - self.attrs["use_xpu"] == True): + if self.is_xpu_op(): 
self.__class__.use_xpu = True places = self._get_places() @@ -1452,10 +1482,10 @@ def check_grad_with_place(self, op_outputs = self.outputs if hasattr(self, "outputs") else dict() op_attrs = self.attrs if hasattr(self, "attrs") else dict() - if self.is_bfloat16_op(): + self._check_grad_helper() + if self.is_bfloat16_op() and self.is_mkldnn_op(): check_dygraph = False - self._check_grad_helper() if self.dtype == np.float64 and \ self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST: numeric_grad_delta = 1e-5 diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index 0fc3dccab4a64..948e344e4c158 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -14,7 +14,6 @@ from __future__ import print_function -import op_test import unittest import numpy as np @@ -22,9 +21,10 @@ import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16 -class TestCastOp1(op_test.OpTest): +class TestCastOpFp32ToFp64(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} @@ -42,7 +42,7 @@ def test_grad(self): self.check_grad(['X'], ['Out']) -class TestCastOp2(op_test.OpTest): +class TestCastOpFp16ToFp32(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float16')} @@ -57,7 +57,7 @@ def test_check_output(self): self.check_output(atol=1e-3) -class TestCastOp3(op_test.OpTest): +class TestCastOpFp32ToFp16(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} @@ -72,6 +72,36 @@ def test_check_output(self): self.check_output(atol=1e-3) +class TestCastOpBf16ToFp32(OpTest): + def setUp(self): + ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16') + self.inputs = {'X': ipt} + self.outputs = {'Out': convert_uint16_to_float(ipt)} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.BF16), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output() + + +class TestCastOpFp32ToBf16(OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]).astype('float32') + self.inputs = {'X': ipt} + self.outputs = {'Out': convert_float_to_uint16(ipt)} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.BF16) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output() + + class TestCastOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): From 7e6c0ceef27ec8e0f7fa15d688babd4ee67d20f0 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Sat, 9 Oct 2021 21:04:41 +0800 Subject: [PATCH 37/80] Implement Fused BN + Add + Relu with cudnnFusedOps API. 
(#35955) --- paddle/fluid/operators/fused/CMakeLists.txt | 1 + .../operators/fused/cudnn_bn_add_relu_test.cc | 380 ++++++++++++++++++ .../fused/cudnn_bn_stats_finalize.cu.h | 181 +++++++++ .../fused/cudnn_scale_bias_add_relu.cu.h | 292 ++++++++++++++ 4 files changed, 854 insertions(+) create mode 100644 paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc create mode 100644 paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h create mode 100644 paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 599be6912b760..2630c12db2fc9 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -80,5 +80,6 @@ if (WITH_GPU OR WITH_ROCM) endif() if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) + cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() endif() diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc new file mode 100644 index 0000000000000..7229754cb8ed8 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -0,0 +1,380 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" +#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace op = paddle::operators; +using Tensor = paddle::framework::Tensor; + +USE_OP(batch_norm); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + std::default_random_engine random(0); + std::uniform_real_distribution dis(0.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} + +template +void InitConstantTensor(const std::vector &dims, T value, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = value; + } +} + +template +void CheckOutput(std::string name, const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + if (cpu_res.dims().size() == cpu_base.dims().size()) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + } else { + EXPECT_EQ(cpu_res.numel(), cpu_base.numel()); + } + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + float max_diff = 0; + int index = 0; + for (int i = 0; i < cpu_res.numel(); ++i) { + float cur_diff; + if (is_relative_atol) { + cur_diff = static_cast( + std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + cur_diff = static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + if (cur_diff > max_diff) { + max_diff = cur_diff; + index = i; + } + } + std::string error_type = is_relative_atol ? "relative" : "absolute"; + LOG(INFO) << "[" << name << "], The dims is [" << cpu_res.dims() + << "], maximum " << error_type << " error is " << max_diff << ": " + << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index]; +} + +template +void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + // x is in NHWC format. 
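+ // Per-channel statistics: every element x[n, h, w, c] contributes to
+ // cpu_sum[c] and cpu_sum_of_square[c], which later feed the BN stats
+ // finalize step.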
+ auto dims = cpu_x.dims(); + int64_t c = dims[3]; + + const T *cpu_x_ptr = cpu_x.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_x.numel() / c; ++i) { + float tmp_x = static_cast(cpu_x_ptr[i * c + j]); + tmp_sum += tmp_x; + tmp_sum_of_squares += tmp_x * tmp_x; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; + } +} + +// get paddle batchnorm op results as baseline +void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *cpu_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + std::string data_layout = "NHWC"; + attrs.insert({"data_layout", data_layout}); + + auto op = framework::OpRegistry::CreateOp( + "batch_norm", {{"X", {"X"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"Mean", {"Mean"}}, + {"Variance", {"Variance"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + TensorCopySync(*reserve_space, platform::CPUPlace(), cpu_reserve_space); +} + +template +class CudnnBNAddReluTester { + public: + CudnnBNAddReluTester(int batch_size, int height, int width, int channels) { + batch_size_ = batch_size; + height_ = height; + width_ = width; + channels_ = channels; + ele_count_ = batch_size_ * height_ * width_; + SetUp(); + } + + ~CudnnBNAddReluTester() {} + + void CheckForward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_mean_base; + framework::Tensor cpu_var_base; + framework::Tensor cpu_saved_mean_base; + framework::Tensor cpu_saved_var_base; + framework::Tensor cpu_y_base; + framework::Tensor cpu_reserve_space_base; + BaselineForward(*ctx, &cpu_mean_base, &cpu_var_base, &cpu_saved_mean_base, + 
&cpu_saved_var_base, &cpu_y_base, &cpu_reserve_space_base); + + framework::Tensor cpu_mean; + framework::Tensor cpu_var; + framework::Tensor cpu_saved_mean; + framework::Tensor cpu_saved_var; + framework::Tensor cpu_y; + framework::Tensor cpu_bitmask; + FusedForward(*ctx, &cpu_mean, &cpu_var, &cpu_saved_mean, &cpu_saved_var, + &cpu_y, &cpu_bitmask); + + CheckOutput("Mean", cpu_mean, cpu_mean_base, diff, is_relative_atol); + CheckOutput("Variance", cpu_var, cpu_var_base, diff, + is_relative_atol); + CheckOutput("SavedMean", cpu_saved_mean, cpu_saved_mean_base, diff, + is_relative_atol); + CheckOutput("SavedVariance", cpu_saved_var, cpu_saved_var_base, diff, + is_relative_atol); + CheckOutput("Y", cpu_y, cpu_y_base, diff, is_relative_atol); + } + + private: + void SetUp() { + // Initialize input data + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_x_); + ComputeSumAndSquareSum(cpu_x_, &cpu_sum_, &cpu_sum_of_square_); + + // scale and bias should be initialized randomly. + InitConstantTensor({channels_}, static_cast(1.0f), + &cpu_bn_scale_); + InitConstantTensor({channels_}, static_cast(0.0f), + &cpu_bn_bias_); + } + + void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var) { + InitConstantTensor({channels_}, static_cast(0.0f), cpu_mean); + InitConstantTensor({channels_}, static_cast(1.0f), cpu_var); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_mean); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_var); + } + + void BaselineForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *cpu_reserve_space) { + InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); + ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_, cpu_bn_bias_, cpu_mean, + cpu_var, cpu_saved_mean, cpu_saved_var, cpu_y, + cpu_reserve_space); + } + + // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, Tensor *cpu_bitmask) { + framework::Tensor x; + framework::Tensor sum; + framework::Tensor sum_of_square; + framework::Tensor bn_scale; + framework::Tensor bn_bias; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x_, place, &x); + TensorCopySync(cpu_sum_, place, &sum); + TensorCopySync(cpu_sum_of_square_, place, &sum_of_square); + TensorCopySync(cpu_bn_scale_, place, &bn_scale); + TensorCopySync(cpu_bn_bias_, place, &bn_bias); + + bn_scale.Resize({1, 1, 1, channels_}); + bn_bias.Resize({1, 1, 1, channels_}); + + T *x_ptr = x.data(); + float *sum_ptr = sum.data(); + float *sum_of_square_ptr = sum_of_square.data(); + float *bn_scale_ptr = bn_scale.data(); + float *bn_bias_ptr = bn_bias.data(); + + framework::Tensor mean; + framework::Tensor var; + framework::Tensor saved_mean; + framework::Tensor saved_var; + framework::Tensor equiv_scale; + framework::Tensor equiv_bias; + framework::Tensor y; + framework::Tensor bitmask; + + InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); + TensorCopySync(*cpu_mean, place, &mean); + TensorCopySync(*cpu_var, place, &var); + + mean.Resize({1, 1, 1, channels_}); + var.Resize({1, 1, 1, channels_}); + + float *mean_ptr = mean.data(); + float *var_ptr = var.data(); + float *saved_mean_ptr = + saved_mean.mutable_data({1, 1, 1, channels_}, place); + float *saved_var_ptr = + saved_var.mutable_data({1, 1, 1, 
channels_}, place); + T *equiv_scale_ptr = + equiv_scale.mutable_data({1, 1, 1, channels_}, place); + T *equiv_bias_ptr = equiv_bias.mutable_data({1, 1, 1, channels_}, place); + T *y_ptr = + y.mutable_data({batch_size_, height_, width_, channels_}, place); + + // bitmask + int c = channels_; + int64_t nhw = ele_count_; + int32_t c_int32_elems = ((c + 63) & ~63) / 32; + int32_t nhw_int32_elems = (nhw + 31) & ~31; + int32_t *bitmask_ptr = bitmask.mutable_data( + {nhw_int32_elems, c_int32_elems, 1}, place); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + // 1. BN Stats Finalize + op::CudnnBNStatsFinalize bn_op(ctx, param_shape); + bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, + equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, + true); + + // 2. Scale Bias + Relu (not fused add) + std::string act_type = ""; + op::CudnnScaleBiasAddRelu sbar_op( + ctx, act_type, false, false, data_shape, param_shape, bitmask_shape); + sbar_op.Forward(ctx, x_ptr, equiv_scale_ptr, equiv_bias_ptr, y_ptr, + bitmask_ptr); + + TensorCopySync(mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(var, platform::CPUPlace(), cpu_var); + TensorCopySync(saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(saved_var, platform::CPUPlace(), cpu_saved_var); + TensorCopySync(y, platform::CPUPlace(), cpu_y); + TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); + } + + private: + int batch_size_; + int height_; + int width_; + int channels_; + int ele_count_; + + // Forward input + framework::Tensor cpu_x_; + framework::Tensor cpu_sum_; + framework::Tensor cpu_sum_of_square_; + framework::Tensor cpu_bn_scale_; + framework::Tensor cpu_bn_bias_; + + double eps_ = 1e-5; + float momentum_ = 0.9; +}; + +TEST(CudnnBNAddReluForward, GPUCudnnBNAddReluForwardFp16) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + CudnnBNAddReluTester test(batch_size, height, + width, channels); + test.CheckForward(2e-3); +} diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h new file mode 100644 index 0000000000000..7d4b24cd4fc3d --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct BNStatsFinalizeArgs { + BNStatsFinalizeArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::vector ¶m_shape) { + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to 4. But recieved " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + + in_desc.set(param_shape, format, param_dtype); + out_desc.set(param_shape, format, dtype); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; +}; + +template +class CudnnBNStatsFinalize { + public: + CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx, + const std::vector ¶m_shape) + : train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING), + inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) { + args_.Set(param_shape); + } + ~CudnnBNStatsFinalize() {} + + void Forward(const platform::CUDADeviceContext &ctx, float *sum_ptr, + float *sum_of_squares_ptr, float *scale_ptr, float *bias_ptr, + float *saved_mean_ptr, float *saved_invstd_ptr, + float *running_mean_ptr, float *running_var_ptr, + T *equiv_scale_ptr, T *equiv_bias_ptr, double eps, + float momentum, int64_t ele_count, bool is_train) { + if (is_train) { + TrainInit(ctx); + } else { + InferenceInit(ctx); + } + auto &op = is_train ? 
train_op_ : inference_op_; + + // Set variant_param for both inference_op_ and train_op_ + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_VAR, running_var_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, equiv_scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, equiv_bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, &eps); + + // Set extra variant_param only for train_op_: + if (is_train) { + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, saved_invstd_ptr); + double avg_factor = 1.0 - momentum; + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT, + &ele_count); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR, + &avg_factor); + } + // fused op execute + auto handle = ctx.cudnn_handle(); + op.Execute(handle); + } + + private: + void TrainInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for train op + train_op_.SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER, + CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for train op + train_op_.SetOpConstParamDesc( + {CUDNN_PARAM_YSTATS_DESC, CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC}, + args_.in_desc.desc()); + train_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + train_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. + size_t workspace_size_bytes = train_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + void InferenceInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for inference op + inference_op_.SetOpConstParamAttr( + {CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for inference op + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.in_desc.desc()); + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + inference_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. 
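+ // The fused finalize plan is expected to need no scratch workspace, so a
+ // non-zero size reported below is treated as an error.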
+ size_t workspace_size_bytes = inference_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + BNStatsFinalizeArgs args_; + CudnnFusionOp train_op_; + CudnnFusionOp inference_op_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h new file mode 100644 index 0000000000000..2fdb3635e2e14 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -0,0 +1,292 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct ScaleBiasAddReluArgs { + ScaleBiasAddReluArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::string &act_type, const std::vector &data_shape, + const std::vector ¶m_shape, + const std::vector &bitmask_shape) { + PADDLE_ENFORCE_EQ( + data_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of data_shape is expected to 4. But recieved " + "data_shape's size is %d, data_shape is [%s].", + data_shape.size(), framework::make_ddim(data_shape))); + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to 4. But recieved " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + PADDLE_ENFORCE_EQ( + bitmask_shape.size(), 3U, + platform::errors::InvalidArgument( + "The size of bitmask_shape is expected to 3. 
But recieved " + "bitmask_shape's size is %d, bitmask_shape is [%s].", + bitmask_shape.size(), framework::make_ddim(bitmask_shape))); + + in_desc.set(data_shape, format, dtype); + out_desc.set(data_shape, format, dtype); + equiv_scale_bias_desc.set(param_shape, format, dtype); + scale_bias_mean_var_desc.set(param_shape, format, param_dtype); + bitmask_desc.set(bitmask_shape, format, CUDNN_DATA_INT32); + // set activation desc + cudnnActivationMode_t mode = CUDNN_ACTIVATION_IDENTITY; + if (act_type != "") { + PADDLE_ENFORCE_EQ( + act_type, "relu", + platform::errors::InvalidArgument( + "Only relu activation supported in normalized convolution.")); + mode = CUDNN_ACTIVATION_RELU; + } + double dummy_clip = 0.0; + activation_desc.set(mode, dummy_clip); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor equiv_scale_bias_desc; + platform::TensorDescriptor scale_bias_mean_var_desc; + platform::TensorDescriptor bitmask_desc; + platform::ActivationDescriptor activation_desc; +}; + +template +class CudnnScaleBiasAddRelu { + public: + CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, + const std::string &act_type, bool fused_add, + bool has_shortcut, const std::vector &data_shape, + const std::vector ¶m_shape, + const std::vector &bitmask_shape) + : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK), + bwd_op_(CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM) { + fused_add_ = fused_add; + has_shortcut_ = has_shortcut; + args_.Set(act_type, data_shape, param_shape, bitmask_shape); + } + + ~CudnnScaleBiasAddRelu() {} + + void Forward(const platform::CUDADeviceContext &ctx, T *x_ptr, T *x_scale_ptr, + T *x_bias_ptr, T *out_ptr, int32_t *bitmask_ptr, + T *z_ptr = nullptr, T *z_scale_ptr = nullptr, + T *z_bias_ptr = nullptr) { + ForwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr); + if (has_shortcut_) { + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr); + } else { + if (fused_add_) { + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + } + } + + fwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + + // output ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + fwd_op_.Execute(handle); + }, + fwd_workspace_byte_); + } + + void Backward(const platform::CUDADeviceContext &ctx, T *dy_ptr, T *x_ptr, + float *scale_ptr, float *bias_ptr, float *saved_mean_ptr, + float *saved_invstd_ptr, int32_t *bitmask_ptr, T *dx_ptr, + T *dz_ptr, float *dscale_ptr, float *dbias_ptr, double eps) { + BackwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + bwd_workspace_byte_ = 
bwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, + saved_invstd_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + bwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &bwd_workspace_byte_); + + // output ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DXDATA, dx_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DSCALE, dscale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DBIAS, dbias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, + &eps); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr); + } + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + bwd_op_.Execute(handle); + }, + bwd_workspace_byte_); + } + + private: + void ForwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, CUDNN_PARAM_YDATA_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_) { + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_ZDATA_PLACEHOLDER, CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + } else if (fused_add_) { + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_ZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + if (has_shortcut_ || fused_add_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ZDESC, args_.in_desc.desc()); + } + + // equiv scale/bias desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + if (has_shortcut_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + } + + // output desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + + // bitmask desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + void BackwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + bwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER, + CUDNN_PARAM_DXDATA_PLACEHOLDER, CUDNN_PARAM_BN_SCALE_PLACEHOLDER, + CUDNN_PARAM_BN_BIAS_PLACEHOLDER, CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, CUDNN_PARAM_BN_DBIAS_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_DZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + 
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DXDESC, args_.in_desc.desc()); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DZDESC, args_.in_desc.desc()); + } + + // scale/bias/mean/var desc for backward + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.scale_bias_mean_var_desc.desc()); + + // output desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DYDESC, args_.out_desc.desc()); + + // bitmask desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + bool fused_add_ = false; + bool has_shortcut_ = false; + size_t fwd_workspace_byte_; + size_t bwd_workspace_byte_; + ScaleBiasAddReluArgs args_; + CudnnFusionOp fwd_op_; + CudnnFusionOp bwd_op_; +}; +#endif +} // namespace operators +} // namespace paddle From 9b987b3d95dd6b29f0fb03f4d96e9398c67afe47 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Sun, 10 Oct 2021 20:59:55 -0500 Subject: [PATCH 38/80] Add skip case for conv2d convert test (#36301) --- .../inference/test_trt_convert_conv2d_transpose.py | 14 ++++++++++++-- .../inference/test_trt_convert_depthwise_conv2d.py | 11 ++++++++++- .../test_trt_convert_depthwise_conv2d_transpose.py | 12 +++++++++++- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py index 82dd492b5275f..2c8f2592a737c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -173,7 +173,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -185,7 +185,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), (1e-5, 1e-5) @@ -214,6 +214,16 @@ def teller2(program_config, predictor_config): "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." ) + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." 
+ ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py index e6b3aa30bf896..fc2358bb11636 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py @@ -165,7 +165,6 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), (1e-5, 1e-5) # for dynamic_shape - generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(attrs, @@ -190,6 +189,16 @@ def teller1(program_config, predictor_config): "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." ) + def teller2(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." + ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py index 473925c6cdb79..2fcd2bf5aca97 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py @@ -137,7 +137,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -178,6 +178,16 @@ def teller2(program_config, predictor_config): "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." ) + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." + ) + def test(self): self.add_skip_trt_case() self.run_test() From 5690666ce60baaee84fb92583bf10a259a8cd385 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 11 Oct 2021 10:23:17 +0800 Subject: [PATCH 39/80] Add use_cinn Flag and RunFromCinn in PE (#36107) Add use_cinn flag and use it to control whether we run PaddlePaddle using CINN. 
Also add: Replace PaddlePaddle graph with a CINN graph in a pass PE Method to feed data and run the graph by CINN --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 7 ++- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../fluid/framework/ir/paddle_to_cinn_pass.cc | 31 ++++++++++ .../fluid/framework/ir/paddle_to_cinn_pass.h | 30 ++++++++++ .../framework/ir/paddle_to_cinn_pass_test.cc | 40 +++++++++++++ .../framework/paddle2cinn/cinn_runner.cc | 15 +++++ .../fluid/framework/paddle2cinn/cinn_runner.h | 12 +++- .../framework/paddle2cinn/cinn_runner_test.cc | 11 ++-- paddle/fluid/framework/parallel_executor.cc | 36 ++++++++++++ paddle/fluid/framework/parallel_executor.h | 5 ++ paddle/fluid/platform/flags.cc | 10 ++++ paddle/fluid/pybind/pybind.cc | 12 ++++ python/paddle/fluid/executor.py | 16 +++++- .../test_parallel_executor_run_cinn.py | 56 +++++++++++++++++++ 16 files changed, 277 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.cc create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.h create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 67073350d5a8a..6e57b829ade4e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -351,7 +351,7 @@ target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_h cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor graph build_strategy bind_threaded_ssa_graph_executor collective_helper - fast_threaded_ssa_graph_executor variable_helper) + fast_threaded_ssa_graph_executor variable_helper cinn_runner) cc_library(executor_cache SRCS executor_cache.cc DEPS parallel_executor) if(WITH_PSCORE) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 72f7f0e6011c1..ad81b48847af9 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + paddle_to_cinn_pass fix_op_run_order_pass) if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 0d55882953db3..a55b809055f3e 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -19,8 +19,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" -DECLARE_bool(use_mkldnn); DECLARE_bool(convert_all_blocks); +DECLARE_bool(use_cinn); +DECLARE_bool(use_mkldnn); namespace paddle { namespace framework { @@ -71,6 +72,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to check whether the multi_device_graph is right. AppendPass("multi_devices_check_pass"); + // Note: This pass is used to enable cinn. + if (FLAGS_use_cinn) { + AppendPass("paddle_to_cinn_pass"); + } SetCollectiveContext(); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 99c691e6cf6f7..6f5f27400752d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,6 +59,7 @@ cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) pass_library(graph_to_program_pass base) +pass_library(paddle_to_cinn_pass base DEPS cinn_runner) pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base DEPS string_helper) pass_library(fc_fuse_pass inference) @@ -142,6 +143,7 @@ cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(paddle_to_cinn_pass_test SRCS paddle_to_cinn_pass_test.cc DEPS paddle_to_cinn_pass proto_desc) cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc new file mode 100644 index 0000000000000..fbf2cfb8d41d6 --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" + +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +namespace paddle { +namespace framework { +namespace ir { + +void PaddleToCinnPass::ApplyImpl(ir::Graph* graph) const { + paddle2cinn::CinnRunner::GetInstance()->ReplaceWithCinn(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(paddle_to_cinn_pass, paddle::framework::ir::PaddleToCinnPass); diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.h b/paddle/fluid/framework/ir/paddle_to_cinn_pass.h new file mode 100644 index 0000000000000..f3b9bd21ebf9c --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PaddleToCinnPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc new file mode 100644 index 0000000000000..49d2ce295f385 --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(PaddleToCinnPassTest, TodoTest) { + ProgramDesc program; + Graph graph(program); + + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "paddle_to_cinn_pass"); + + pass->Apply(&graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(paddle_to_cinn_pass); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc index de5af910c99ad..ba90095cae679 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include +#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/scope.h" @@ -26,6 +28,19 @@ namespace paddle2cinn { using ir::Graph; +std::once_flag CinnRunner::get_instance_once_flag_; +std::shared_ptr CinnRunner::instance_; + +std::shared_ptr CinnRunner::GetInstance() { + std::call_once(get_instance_once_flag_, + [&]() { instance_.reset(new CinnRunner()); }); + return instance_; +} + +void CinnRunner::ReplaceWithCinn(Graph* graph) { + // TODO(zhhsplendid): call CINN Api when it is ready +} + std::map CinnRunner::Run( const Graph& graph, Scope* scope, std::map* feed_targets) { diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.h b/paddle/fluid/framework/paddle2cinn/cinn_runner.h index 5f63d64545ff7..23d9565d2f392 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -36,15 +37,24 @@ namespace paddle2cinn { // cache. class CinnRunner { public: - CinnRunner() {} ~CinnRunner() {} + // Singleton + static std::shared_ptr GetInstance(); + + // Replace Paddle graph with some CINN subgraphs/ops + void ReplaceWithCinn(ir::Graph* graph); + // Feed LoDTensors to tun CINN compiled object and return fetched result std::map Run( const ir::Graph& graph, Scope* scope, std::map* feed_targets); private: + CinnRunner() {} + + static std::once_flag get_instance_once_flag_; + static std::shared_ptr instance_; std::unordered_map, CinnCacheKey::Hash> cache_; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc index 88aca0bd66b37..c02b994c147ca 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "gtest/gtest.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +#include +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -32,8 +34,9 @@ TEST(CinnRunnerTest, TodoTest) { Scope empty_scope; std::map empty_feed; - CinnRunner cinn_runner; - cinn_runner.Run(empty_graph, &empty_scope, &empty_feed); + std::shared_ptr cinn_runner = CinnRunner::GetInstance(); + cinn_runner->ReplaceWithCinn(&empty_graph); + cinn_runner->Run(empty_graph, &empty_scope, &empty_feed); } } // namespace paddle2cinn diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d19ac0b65f4d1..3b80e9c78677d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" @@ -43,6 +44,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +DECLARE_bool(use_cinn); DECLARE_double(eager_delete_tensor_gb); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -943,6 +945,40 @@ void ParallelExecutor::RunWithoutFetch( member_->executor_->Run(/*fetch_tensors*/ {}, /*return_merged*/ false); } +FetchResultType ParallelExecutor::RunFromCinn( + const std::unordered_map &feed_tensors, + const std::vector &fetch_names) { + // Feed tensor to scope, now only support 1 scope + // TODO(zhhsplendid): handle multiple scope + size_t scope_id = 0; + std::map cinn_input_tensors; + for (auto &name_tensor_pair : feed_tensors) { + bool is_persistable = member_->IsPersistable(name_tensor_pair.first); + if (!is_persistable) { + member_->SetSkipMemoryReuse(scope_id, name_tensor_pair.first); + } + Scope *feed_scope = is_persistable ? member_->local_scopes_[scope_id] + : member_->local_exec_scopes_[scope_id]; + Variable *feed_var = feed_scope->Var(name_tensor_pair.first); + LoDTensor *trg = feed_var->GetMutable(); + trg->ShareDataWith(name_tensor_pair.second); + trg->set_lod(name_tensor_pair.second.lod()); + + cinn_input_tensors[name_tensor_pair.first] = trg; + } + + // TODO(zhhsplendid): get correct API after CINN API is ready + // now only return empty fetch result; + std::shared_ptr cinn_runner = + paddle2cinn::CinnRunner::GetInstance(); + + cinn_runner->Run(Graph(), member_->local_exec_scopes_[scope_id], + &cinn_input_tensors); + + paddle::framework::FetchResultType fetches = FetchList(fetch_names.size()); + return fetches; +} + void ParallelExecutor::SkipMemoryReuse( size_t scope_idx, const std::vector &skip_vars) { for (auto &var_name : skip_vars) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 78774f0489638..f908ce3f01393 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include @@ -92,6 +93,10 @@ class ParallelExecutor { void RunWithoutFetch(const std::vector &skip_eager_vars); + FetchResultType RunFromCinn( + const std::unordered_map &feed_tensors, + const std::vector &fetch_names); + void ResetOpHandleScopeMapOfGraphs( const std::unordered_map &scope_map); diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 7a7666665511f..18636f6f84278 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -681,6 +681,16 @@ PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); +/** + * CINN related FLAG + * Name: FLAGS_use_cinn + * Since Version: 2.3 + * Value Range: bool, default=false + * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN + */ +PADDLE_DEFINE_EXPORTED_bool( + use_cinn, false, "It controls whether to run PaddlePaddle using CINN"); + DEFINE_int32(record_pool_max_size, 2000000, "SlotRecordDataset slot record pool max size"); DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f58c2a5db381c..80350abb4fe21 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3293,6 +3293,18 @@ All parameter, weight, gradient are variables in Paddle. BOOST_GET(paddle::framework::FetchUnmergedList, ret))); } }) + .def("run_from_cinn", + [](ParallelExecutor &self, + const std::unordered_map &feed_tensors, + const std::vector &fetch_names) -> py::object { + paddle::framework::FetchResultType ret; + { + pybind11::gil_scoped_release release; + ret = self.RunFromCinn(feed_tensors, fetch_names); + } + return py::cast( + std::move(BOOST_GET(paddle::framework::FetchList, ret))); + }) .def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 8c118f31cbe87..bea5b29ecafa6 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -23,7 +23,8 @@ from .wrapped_decorator import signature_safe_contextmanager import six from .data_feeder import convert_dtype -from .framework import Program, default_main_program, Variable, Operator, convert_np_dtype_to_dtype_ +from .framework import Program, default_main_program, Variable, Operator +from .framework import convert_np_dtype_to_dtype_, get_flags from . import core from . import unique_name from . 
import compiler @@ -1016,7 +1017,16 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, check_feed_shape_type(var, feed_tensor, exe.device_count()) feed_tensor_dict[feed_name] = feed_tensor - exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) + #TODO(zhhsplendid): handle other feed data format case for CINN + use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] + if use_cinn: + fetch_var_names = list(map(_to_name_str, fetch_list)) + fetch_tensors = exe.run_from_cinn( + feed_tensor_dict, fetch_var_names)._move_to_list() + return as_numpy( + fetch_tensors) if return_numpy else fetch_tensors + else: + exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): res = list() for i, each in enumerate(feed): @@ -1036,6 +1046,8 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, check_feed_shape_type(var, tensor) res_dict[feed_name] = tensor res.append(res_dict) + + use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] exe.feed_tensors_into_local_scopes(res) if hasattr(program._program, 'lr_sheduler'): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py new file mode 100644 index 0000000000000..e8b1d838261f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import paddle +import unittest + +paddle.enable_static() + + +class TestParallelExecutorRunCinn(unittest.TestCase): + def test_run_from_cinn(self): + paddle.set_flags({'FLAGS_use_cinn': True}) + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + data = paddle.static.data( + name='X', shape=[None, 1], dtype='float32') + prediction = paddle.static.nn.fc(data, 2) + loss = paddle.mean(prediction) + adam = paddle.optimizer.Adam() + adam.minimize(loss) + + place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_program) + compiled_program = paddle.static.CompiledProgram( + main_program).with_data_parallel(loss_name=loss.name) + + batch_size = 16 + x = np.random.random(size=(batch_size, 1)).astype('float32') + fetch = exe.run(compiled_program, + feed={'X': x}, + fetch_list=[prediction.name], + return_merged=False) + + paddle.set_flags({'FLAGS_use_cinn': False}) + + +if __name__ == '__main__': + unittest.main() From 34bd18ff330fa2095338af1da3caa386f63fed60 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Mon, 11 Oct 2021 10:45:37 +0800 Subject: [PATCH 40/80] add skip case in trt converter ut (#36287) * add skip case in trt converter ut * disable group_norm trt plugin --- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 6 - .../inference/test_trt_convert_elementwise.py | 135 +++++++++++++----- .../test_trt_convert_emb_eltwise_layernorm.py | 12 ++ .../inference/test_trt_convert_group_norm.py | 26 +++- .../test_trt_convert_multihead_matmul.py | 31 +++- 6 files changed, 165 insertions(+), 53 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 5bfd2f1277795..44c001b0bc595 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -48,9 +48,11 @@ struct SimpleOpTypeSetTeller : public Teller { int8_teller_set.insert("skip_layernorm"); int8_teller_set.insert("slice"); #endif -#if IS_TRT_VERSION_GE(7130) - teller_set.insert("group_norm"); -#endif +// TODO(baoachun) The group_norm trt plugin will check input's dim +// not -1 failed when dynamic shape mode. +// #if IS_TRT_VERSION_GE(7130) +// teller_set.insert("group_norm"); +// #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 69e0075729b0d..d6a1cdb9e68a6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -65,12 +65,6 @@ nvinfer1::Dims ElementWisePlugin::getOutputDimensions( } int ElementWisePlugin::initialize() TRT_NOEXCEPT { - PADDLE_ENFORCE_GT(dims_y_.nbDims, 0, - platform::errors::InvalidArgument( - "The dimension of input Y of TRT elementwise op plugin " - "should be greater than 0, but got %d.", - dims_y_.nbDims)); - axis_ = (axis_ == -1) ? 
dims_x_.nbDims - dims_y_.nbDims : axis_; int trimed_nb_dims = dims_y_.nbDims; for (; trimed_nb_dims > 0; --trimed_nb_dims) { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 2d18738b614cb..c8cba0f372380 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -32,8 +32,8 @@ def generate_weight(): return np.random.randn(32).astype(np.float32) for batch in [1, 2, 4]: - for shape in [[32], [batch, 32], [batch, 64, 32], - [batch, 8, 16, 32]]: + for shape in [[32], [batch, 32], [batch, 32, 32], + [batch, 32, 16, 32]]: for op_type in ["elementwise_add", "elementwise_mul"]: for axis in [len(shape) - 1, -1]: self.dims = len(shape) @@ -68,26 +68,27 @@ def generate_weight(): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): + # The input.dims[1] must be equal to the weight's length. if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [4]} self.dynamic_shape.max_input_shape = {"input_data": [256]} self.dynamic_shape.opt_input_shape = {"input_data": [16]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 256]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 4]} self.dynamic_shape.max_input_shape = { - "input_data": [4, 256, 256] + "input_data": [4, 32, 256] } self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 16]} elif self.dims == 4: self.dynamic_shape.min_input_shape = { - "input_data": [1, 4, 4, 4] + "input_data": [1, 32, 4, 4] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 256, 128, 256] + "input_data": [4, 32, 128, 256] } self.dynamic_shape.opt_input_shape = { "input_data": [2, 32, 32, 16] @@ -98,6 +99,11 @@ def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.opt_input_shape = {} + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -106,18 +112,52 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), 
generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dims == 2 and len(self.dynamic_shape.max_input_shape) == 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + + def teller2(program_config, predictor_config): + if self.dims == 3: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and tensorrt when input dim is 3.") + + def teller3(program_config, predictor_config): + if self.dims == 4: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and tensorrt when input dim is 4.") def test(self): + self.add_skip_trt_case() self.run_test() @@ -245,15 +285,26 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dims == 2: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs - if len(inputs['input_data1'].shape) == 1 or len(inputs['input_data2'] - .shape) == 1: + if len(inputs['input_data1'].shape) != len(inputs['input_data2'].shape): return False return True @@ -264,24 +315,27 @@ def generate_input(shape): input1_shape_list = [[4, 32], [2, 4, 32], [4, 2, 4, 32]] input2_shape1_list = [[32], [4, 32], [2, 4, 32]] - input2_shape2_list = [[1, 32], [1, 1, 32], [1, 1, 1, 32]] - input2_shape3_list = [[1, 32], [1, 4, 32], [4, 32]] + input2_shape2_list = [[4, 1], [2, 4, 1], [4, 2, 4, 1]] + input2_shape3_list = [[32], [2, 1, 1], [4, 2, 1, 1]] + input2_shape4_list = [[32], [4, 32], [4, 1, 1, 1]] input2_shape_list = [ - input2_shape1_list, input2_shape2_list, input2_shape3_list + input2_shape1_list, input2_shape2_list, input2_shape3_list, + input2_shape4_list ] axis1_list = [[-1], [1, -1], [1, -1]] - axis2_list = [[-1], [-1], [-1]] - axis3_list = [[-1], [-1], [2, -1]] - axis_list = [axis1_list, axis2_list, axis3_list] + axis2_list = [[-1], [0], [0]] + axis3_list = [[-1], [0], [0]] + axis4_list = [[-1], [-1], [0]] + axis_list = [axis1_list, axis2_list, axis3_list, axis4_list] for i in range(3): input1_shape = input1_shape_list[i] - for j in range(3): + for j in range(4): input2_shape = input2_shape_list[j][i] for op_type in ["elementwise_add", "elementwise_mul"]: for axis in axis_list[j][i]: - self.dims1 = len(input1_shape) - self.dims2 = len(input2_shape) + self.shape1 = input1_shape + self.shape2 = input2_shape dics = [{"axis": axis}] ops_config = [{ "op_type": op_type, @@ -318,16 +372,16 @@ def generate_dynamic_shape(attrs): opt_shape = [[32], [32, 32], [32, 32, 32], [32, 32, 32, 32]] self.dynamic_shape.min_input_shape = { - "input_data1": min_shape[self.dims1 - 1], - "input_data2": min_shape[self.dims2 - 1] + "input_data1": min_shape[len(self.shape1) - 1], + "input_data2": min_shape[len(self.shape2) - 1] } self.dynamic_shape.max_input_shape = { - "input_data1": max_shape[self.dims1 - 1], - 
"input_data2": max_shape[self.dims2 - 1] + "input_data1": max_shape[len(self.shape1) - 1], + "input_data2": max_shape[len(self.shape2) - 1] } self.dynamic_shape.opt_input_shape = { - "input_data1": opt_shape[self.dims1 - 1], - "input_data2": opt_shape[self.dims2 - 1] + "input_data1": opt_shape[len(self.shape1) - 1], + "input_data2": opt_shape[len(self.shape2) - 1] } def clear_dynamic_shape(): @@ -342,10 +396,11 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + if self.shape1[0] == self.shape2[0]: + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -354,7 +409,19 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.shape1) == 2: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py index f25a3b82476dc..d7b0bcd908085 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py @@ -252,7 +252,19 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 4), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Half and len( + self.dynamic_shape.min_input_shape) != 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt when dynamic fp16 mode.") + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py index 0224f20ec747e..b6b5aa9dbfe95 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py @@ -114,19 +114,33 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) - # self.trt_param.precision = paddle_infer.PrecisionType.Float32 - # yield self.create_inference_config(), 
generate_trt_nodes_num(attrs, True), 1e-5 - # self.trt_param.precision = paddle_infer.PrecisionType.Half - # yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The goup_norm plugin will check dim not -1 failed when dynamic fp16 mode." + ) def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index e772df522b5c5..0b98ab53fcc29 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -38,6 +38,7 @@ def generate_weight2(): return np.random.randn(768).astype(np.float32) for batch in [1, 2, 4]: + self.batch = batch for reshape_shape in [[0, 0, 12, 64]]: for dim1 in [128]: input2_shapes = [[batch, reshape_shape[2], dim1, dim1], @@ -417,18 +418,40 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 4), 1e-5 + yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 4), 1e-5 + yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Half: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt in fp16 mode.") + + def teller2(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Float32 and len( + self.dynamic_shape.min_input_shape) != 0 and self.batch > 2: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2." + ) def test(self): + self.add_skip_trt_case() self.run_test() From 2b7b752a1c8eb5ffd24d06729c4d3d6bcb1f6b1a Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Mon, 11 Oct 2021 11:12:24 +0800 Subject: [PATCH 41/80] add mish trt plugin (#34123) * add mish trt plugin, compile & install success, run error. 
test=develop * modify code according to review * add TRT_NOEXCEPT for mish trt plugin * add unittest for mish trt plugin * remove unnecessary check of mish in op_teller.cc * fix some problem of trt8 * add check and modify unittest while converting mish to trt plugin Co-authored-by: dengkaipeng --- paddle/fluid/framework/ir/is_test_pass.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/mish_op.cc | 74 ++++++ .../tensorrt/convert/test_mish_op.cc | 47 ++++ paddle/fluid/inference/tensorrt/op_teller.cc | 41 ++- .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/mish_op_plugin.cu | 235 ++++++++++++++++++ .../tensorrt/plugin/mish_op_plugin.h | 175 +++++++++++++ .../ir/inference/test_trt_activation_pass.py | 36 +++ .../ir/inference/test_trt_convert_mish.py | 174 +++++++++++++ 11 files changed, 785 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/mish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_mish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 25bf03f426a1d..a97873e82f455 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign", "silu"}; + "softsign", "silu", "mish"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 804f035a2e2ca..3136e53e74d09 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1410,6 +1410,7 @@ USE_TRT_CONVERTER(reduce_mean); USE_TRT_CONVERTER(tile); USE_TRT_CONVERTER(conv3d); USE_TRT_CONVERTER(conv3d_transpose); +USE_TRT_CONVERTER(mish); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index c79915629b70d..f2c7a4b62bbbb 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -17,6 +17,7 @@ nv_library(tensorrt_converter gather_nd_op.cc tile_op.cc conv3d_op.cc + mish_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/mish_op.cc b/paddle/fluid/inference/tensorrt/convert/mish_op.cc new file mode 100644 index 0000000000000..6b646d9935b52 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/mish_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Mish converter from fluid to tensorRT. + */ +class MishOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid Mish op to tensorrt Mish plugin"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + + const float threshold = + op_desc.HasAttr("threshold") + ? BOOST_GET_CONST(float, op_desc.GetAttr("threshold")) + : 20.0f; + + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPluginDynamic* plugin = + new plugin::MishPluginDynamic(threshold, with_fp16); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + } else { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPlugin* plugin = new plugin::MishPlugin(threshold, with_fp16); + layer = engine_->AddPlugin(&input, input_num, plugin); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "mish", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(mish, MishOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc new file mode 100644 index 0000000000000..c84c30255fa96 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(mish_op, test_mish) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("mish-X", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("mish-Out", nvinfer1::Dims3(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mish"); + desc.SetInput("X", {"mish-X"}); + desc.SetOutput("Out", {"mish-Out"}); + + desc.SetAttr("threshold", 20.0f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(mish); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 44c001b0bc595..7a70ceda60c1f 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -136,7 +136,8 @@ struct SimpleOpTypeSetTeller : public Teller { "reduce_sum", "reduce_mean", "conv3d", - "conv3d_transpose"}; + "conv3d_transpose", + "mish"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -1048,6 +1049,44 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "mish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of mish TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of mish TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) { + VLOG(3) << "mish op does not support input's dim is 1 in tensorrt."; + return false; + } + + if (!with_dynamic_shape) { + if (x_shape.size() == 2) { + VLOG(3) << "mish op does not support input's dim is 2 in tensorrt."; + return false; + } + } + } + if (op_type == "roi_align") { if (!with_dynamic_shape) { VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 311c2312a9f45..e6bcb59fd092c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -9,6 +9,7 @@ nv_library(tensorrt_plugin yolo_box_op_plugin.cu roi_align_op_plugin.cu gather_nd_op_plugin.cu + mish_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu new file mode 100644 index 0000000000000..6e268e7b0b330 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "glog/logging.h" +#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +int MishPlugin::initialize() TRT_NOEXCEPT { return 0; } + +bool MishPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { + if (with_fp16_) { + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kLINEAR)); + } else { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kLINEAR)); + } +} + +nvinfer1::Dims MishPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* in_dims, + int nb_inputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument( + "We expect [number of inputs] == 1" + "in TRT Mish op plugin, but got " + "[number of inputs] = %d.", + nb_inputs)); + PADDLE_ENFORCE_LT(index, this->getNbOutputs(), + platform::errors::InvalidArgument( + "We expect [index] < [number of outputs]" + "in TRT Mish op plugin, but got " + "[index] = %d, [number of outputs] = %d.", + index, this->getNbOutputs())); + nvinfer1::Dims const& input_dims = in_dims[0]; + nvinfer1::Dims output_dims = input_dims; + return output_dims; +} + +template +__device__ T kTanh(T x) { + return tanh(x); +} + +template <> +__device__ half kTanh(half x) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const float tmp = tanhf(__half2float(x)); + return __float2half(tmp); +#endif +} + +template +__device__ T kSoftplus(T x, T threshold) { + return x > threshold ? x : log(exp(x) + static_cast(1.0f)); +} + +template <> +__device__ half kSoftplus(half x, half threshold) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + return x > threshold ? 
x : hlog(hexp(x) + static_cast(1.0f)); +#endif +} + +template +__global__ void mish_kernel(float threshold, int n, const T* input, T* output) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + const T in = input[idx]; + output[idx] = in * kTanh(kSoftplus(in, static_cast(threshold))); + } +} + +template <> +__global__ void mish_kernel(float threshold, int n, const half* input, + half* output) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + const half in = input[idx]; + output[idx] = + in * kTanh(kSoftplus(in, static_cast(threshold))); + } +#endif +} + +#if IS_TRT_VERSION_LT(8000) +int MishPlugin::enqueue(int batchSize, const void* const* inputs, + void** outputs, +#else +int MishPlugin::enqueue(int batchSize, const void* const* inputs, + void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { + const auto& input_dims = this->getInputDims(0); + int num = batchSize; + for (int i = 0; i < input_dims.nbDims; i++) { + num *= input_dims.d[i]; + } + + const int block_size = 256; + const int grid_size = (num + block_size - 1) / block_size; + + auto type = getDataType(); + if (type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; + const float* input = static_cast(inputs[0]); + float* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else if (type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16"; + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Mish TRT Plugin's input type should be float or half.")); + } + + return cudaGetLastError() != cudaSuccess; +} + +// Dynamic Plugin below. 
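For reference, the kernels above compute mish(x) = x * tanh(softplus(x)), where softplus(x) returns x unchanged once x exceeds the plugin's threshold attribute (20.0f by default in the converter) so that exp(x) cannot overflow. Below is a minimal host-side sketch of the same formula that can be used to sanity-check plugin outputs; the helper name MishRef is illustrative only and not part of the plugin code.

    #include <cmath>
    #include <cstdio>

    // Mirrors the math of mish_kernel: softplus with an overflow guard, then x * tanh(...).
    float MishRef(float x, float threshold = 20.0f) {
      // log1p(exp(x)) == log(exp(x) + 1), i.e. softplus
      float sp = (x > threshold) ? x : std::log1p(std::exp(x));
      return x * std::tanh(sp);
    }

    int main() {
      const float xs[] = {-3.0f, 0.0f, 1.0f, 25.0f};
      for (float x : xs) {
        std::printf("mish(%.1f) = %.6f\n", x, MishRef(x));
      }
      return 0;
    }

The threshold guard matters because the fp16 path evaluates hexp(x) directly; above the cutoff softplus(x) is already indistinguishable from x, so the kernels skip the exp/log entirely.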
+int MishPluginDynamic::initialize() TRT_NOEXCEPT { + getPluginNamespace(); + return 0; +} + +size_t MishPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { + return SerializedSize(threshold_) + SerializedSize(with_fp16_); +} + +void MishPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { + SerializeValue(&buffer, threshold_); + SerializeValue(&buffer, with_fp16_); +} + +nvinfer1::DimsExprs MishPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + return inputs[0]; +} + +bool MishPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of mish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType MishPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The Mish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +int MishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_dims = input_desc[0].dims; + size_t num = ProductDim(input_dims); + const int block_size = 256; + const int grid_size = (num + block_size - 1) / block_size; + + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; + const float* input = static_cast(inputs[0]); + float* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp16"; + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Mish TRT Plugin's input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h new file mode 100644 index 0000000000000..75390666ea097 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -0,0 +1,175 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class MishPlugin : public PluginTensorRT { + private: + float threshold_; + + protected: + size_t getSerializationSize() const TRT_NOEXCEPT override { + return getBaseSerializationSize() + SerializedSize(threshold_); + } + + // TRT will call this func to serialize the configuration of TRT + // It should not be called by users. + void serialize(void* buffer) const TRT_NOEXCEPT override { + serializeBase(buffer); + SerializeValue(&buffer, threshold_); + } + + public: + explicit MishPlugin(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. + MishPlugin(void const* serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &threshold_); + } + + ~MishPlugin() {} + MishPlugin* clone() const TRT_NOEXCEPT override { + return new MishPlugin(threshold_, with_fp16_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT override; +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; +}; + +class MishPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new MishPlugin(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginCreator); + +class MishPluginDynamic : public DynamicPluginTensorRT { + public: + explicit MishPluginDynamic(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + MishPluginDynamic(void const* serialData, size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &threshold_); + DeserializeValue(&serialData, &serialLength, &with_fp16_); + } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + return new MishPluginDynamic(threshold_, with_fp16_); + } + + const char* 
getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + float threshold_; +}; + +class MishPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + auto plugin = new MishPluginDynamic(serial_data, serial_length); + return plugin; + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 8e196f5081f73..62825caf5185c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -139,6 +139,42 @@ def append_act(self, x): return fluid.layers.swish(x) +class TensorRTSubgraphPassMishTest(TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + +class TensorRTSubgraphPassMishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + +class TensorRTSubgraphPassDynamicMishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False) + self.dynamic_shape_params = 
TensorRTSubgraphPassActivationTest.DynamicShapeParam( + { + 'data': [1, 6, 8, 8] + }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): return fluid.layers.prelu(x, mode='all') diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py new file mode 100644 index 0000000000000..d223fd529ab17 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertMishTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(batch, dim1, dim2, dim3): + shape = [batch] + if dim1 != 0: + shape.append(dim1) + if dim2 != 0: + shape.append(dim2) + if dim3 != 0: + shape.append(dim3) + return np.random.random(shape).astype(np.float32) + + for batch in [1, 4]: + for dim1 in [0, 3]: + for dim2 in [0, 16]: + for dim3 in [0, 32]: + for thre in [5.0, 20.0]: + self.dim1 = dim1 + self.dim2 = dim2 + self.dim3 = dim3 + + if dim1 == 0 and dim2 != 0: + continue + if dim1 == 0 and dim2 == 0 and dim3 != 0: + continue + + ops_config = [{ + "op_type": "mish", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["mish_output_data"] + }, + "op_attrs": { + "threshold": thre + } + }] + + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, batch, + dim1, dim2, dim3)) + }, + outputs=["mish_output_data"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + def generate_dynamic_shape(attrs): + if self.dim1 == 0: + self.dynamic_shape.min_input_shape = {"input_data": [1], } + self.dynamic_shape.max_input_shape = {"input_data": [4], } + self.dynamic_shape.opt_input_shape = {"input_data": [2], } + else: + if self.dim2 == 0 and self.dim3 == 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3], + } + elif self.dim2 != 0 and self.dim3 != 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64, 128, 128], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 16, 
32], + } + elif self.dim3 == 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 128], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dim1 == 0 and self.dim2 == 0 and self.dim3 == 0: + return True + return False + + self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, + "Trt does not support 1-dimensional input.") + + def teller2(program_config, predictor_config): + if (len(self.dynamic_shape.min_input_shape) == 0): + if self.dim1 != 0 and self.dim2 == 0 and self.dim3 == 0: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_SUPPORT, + "Need to repair the case: the output of GPU and tensorrt has diff when the input dimension is 2 in static shape mode." 
+ ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() From ea76457c95fd5ab460c768f1d90a640b4b96a429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E5=AD=A3?= <2042519524@qq.com> Date: Mon, 11 Oct 2021 11:14:17 +0800 Subject: [PATCH 42/80] fix the hidden method in paddle.distributed.utils file (#36210) --- python/paddle/distributed/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 6d14b30d18c7f..63585e167e8e3 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -489,9 +489,6 @@ def __ne__(self, pod): def parse_response(self, res_pods): pass - def rank(self): - return self.rank - def get_visible_gpus(self): r = "" for g in self.gpus: From 2bf82e7598bb319e6b959eb58579d39535c999e7 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Mon, 11 Oct 2021 11:24:40 +0800 Subject: [PATCH 43/80] fix fft axis (#36321) fix: `-1` is used when fft's axis is `0` --- python/paddle/tensor/fft.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py index 829399d14eaa0..f7990e3f89107 100644 --- a/python/paddle/tensor/fft.py +++ b/python/paddle/tensor/fft.py @@ -1340,7 +1340,7 @@ def fft_c2c(x, n, axis, norm, forward, name): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) @@ -1370,7 +1370,7 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): if is_interger(x): x = paddle.cast(x, paddle.get_default_dtype()) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) @@ -1409,7 +1409,7 @@ def fft_c2r(x, n, axis, norm, forward, name): elif is_floating_point(x): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) From 642aaa2e18ed6c7b548fc3b109e8cf6eac4aac63 Mon Sep 17 00:00:00 2001 From: Xiaoxu Chen Date: Mon, 11 Oct 2021 11:30:12 +0800 Subject: [PATCH 44/80] use unified external error message for cufft api (#36114) --- cmake/third_party.cmake | 4 +-- paddle/fluid/operators/spectral_op.cu | 5 ++-- paddle/fluid/platform/enforce.h | 14 ++++++++++ paddle/fluid/platform/enforce_test.cc | 22 +++++++++++++++- paddle/fluid/platform/external_error.proto | 1 + tools/externalError/README.md | 30 +++++++++++++++++----- tools/externalError/spider.py | 29 ++++++++++++++++++++- tools/externalError/start.sh | 2 +- 8 files changed, 92 insertions(+), 15 deletions(-) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 892ae270267a7..b3260ba27b072 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -251,8 +251,8 @@ if(WITH_GPU) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) endif() - set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE) - file_download_and_uncompress(${URL} "externalError" MD5 061f3b7895aadcbe2c3ed592590f8b10) # download file externalErrorMsg.tar.gz + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 
a712a49384e77ca216ad866712f7cafa) # download file externalErrorMsg.tar.gz if(WITH_TESTING) # copy externalErrorMsg.pb, just for unittest can get error message correctly. set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 9aa5ca39d737e..24dffaad41b5f 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -83,9 +83,7 @@ static inline std::string get_cufft_error_info(cufftResult error) { } static inline void CUFFT_CHECK(cufftResult error) { - if (error != CUFFT_SUCCESS) { - PADDLE_THROW(platform::errors::External(get_cufft_error_info(error))); - } + PADDLE_ENFORCE_CUDA_SUCCESS(error); } // This struct is used to easily compute hashes of the @@ -413,6 +411,7 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, ? framework::ToRealType(input.type()) : input.type(); auto fft_type = GetFFTTransformType(input.type(), output.type()); + PlanKey Key(framework::vectorize(input.dims()), framework::vectorize(output.dims()), signal_size, fft_type, value_type); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c420a5a64be06..7427060add8b1 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -31,6 +31,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #include +#include #include #include #include @@ -714,6 +715,7 @@ DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); +DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); @@ -751,6 +753,8 @@ inline const char* GetErrorMsgUrl(T status) { return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" "types.html#ncclresult-t"; break; + case platform::proto::ApiType::CUFFT: + return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; default: return "Unknown type of External API, can't get error message URL!"; break; @@ -839,6 +843,7 @@ template std::string GetExternalErrorMsg(curandStatus_t); template std::string GetExternalErrorMsg(cudnnStatus_t); template std::string GetExternalErrorMsg(cublasStatus_t); template std::string GetExternalErrorMsg(cusolverStatus_t); +template std::string GetExternalErrorMsg(cufftResult_t); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) template std::string GetExternalErrorMsg(ncclResult_t); #endif @@ -899,6 +904,15 @@ inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { return sout.str(); } +/*************** CUFFT ERROR ***************/ +inline bool is_error(cufftResult_t stat) { return stat != CUFFT_SUCCESS; } + +inline std::string build_nvidia_error_msg(cufftResult_t stat) { + std::ostringstream sout; + sout << "CUFFT error(" << stat << "). 
" << GetExternalErrorMsg(stat); + return sout.str(); +} + /**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 95a852ad6e92a..c6d5f171ddce4 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/platform/enforce.h" + #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/enforce.h" TEST(ENFORCE, OK) { PADDLE_ENFORCE(true, paddle::platform::errors::Unavailable( @@ -418,6 +419,25 @@ TEST(enforce, cuda_success) { "negative vector size, for example).To correct: ensure that all the " "parameters being passed have valid values")); + EXPECT_TRUE(CheckCudaStatusSuccess(CUFFT_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_PLAN, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_ALLOC_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_TYPE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_VALUE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INTERNAL_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_EXEC_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_SETUP_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_SIZE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_UNALIGNED_DATA, "CUFFT error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CUFFT_INCOMPLETE_PARAMETER_LIST, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_DEVICE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_PARSE_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NO_WORKSPACE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_IMPLEMENTED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_LICENSE_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error")); + #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); diff --git a/paddle/fluid/platform/external_error.proto b/paddle/fluid/platform/external_error.proto index 2094de7e10f69..cbbf803492e64 100644 --- a/paddle/fluid/platform/external_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -24,6 +24,7 @@ enum ApiType { CUBLAS = 3; CUSOLVER = 4; NCCL = 5; + CUFFT = 6; } message MessageDesc { diff --git a/tools/externalError/README.md b/tools/externalError/README.md index 029efd8cb9491..0c2ac626991da 100644 --- a/tools/externalError/README.md +++ b/tools/externalError/README.md @@ -1,9 +1,25 @@ -Usage: +#### **Introduction for crawling new error message:** -Please run: -``` -bash start.sh -``` -If you want to update all external error message, you need to run command `bash start.sh` in current directory, -and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz + +1. add new spider code in spider.py for crawling error message from website. + +2. 
run `bash start.sh` in current directory to generate new externalErrorMsg_${date}.tar.gz file, for example `externalErrorMsg_20210928.tar.gz`. + +3. upload above tar file into bos https://paddlepaddledeps.bj.bcebos.com **paddlepaddledeps** bucket, and copy download link `${download_url}`. ***\*Be careful not to delete original tar file\****. + +4. compute md5 value of above tar file `${md5}`, and modify cmake/third_party.cmake file + + ``` + set(URL "${download_url}" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 ${md5}) + ``` + + for example: + + ``` + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa) + ``` + +5. commit your changes, and create pull request. diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py index a74d82f40ebeb..e07f05f561cb5 100644 --- a/tools/externalError/spider.py +++ b/tools/externalError/spider.py @@ -17,8 +17,10 @@ import urllib.request import json import collections -import sys, getopt +import sys +import getopt import external_error_pb2 +from html.parser import HTMLParser def parsing(externalErrorDesc): @@ -335,6 +337,31 @@ def parsing(externalErrorDesc): _Messages.message = "'%s'. %s" % (error[0], m_message) print("End crawling errorMessage for nvidia NCCL API!\n") + #*************************************************************************************************# + #*********************************** CUFFT Error Message **************************************# + print("start crawling errorMessage for nvidia CUFFT API--->") + url = 'https://docs.nvidia.com/cuda/cufft/index.html#cufftresult' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CUFFT + + html = urllib.request.urlopen(url).read().decode('utf-8') + + class CUFFTHTMLParser(HTMLParser): + '''CUFFTHTML Parser + ''' + + def handle_data(self, data): + if 'typedef enum cufftResult_t' in data: + for line in data.strip().splitlines()[1:-1]: + status, code, desc = re.split('=|//', line.strip()) + _Messages = allMessageDesc.messages.add() + _Messages.code = int(code.strip(' ,')) + _Messages.message = "'%s'. %s" % (status.strip(), + desc.strip()) + + CUFFTHTMLParser().feed(html) + def main(argv): try: diff --git a/tools/externalError/start.sh b/tools/externalError/start.sh index 32ef63c261268..82715dd47326c 100644 --- a/tools/externalError/start.sh +++ b/tools/externalError/start.sh @@ -32,4 +32,4 @@ fi protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto python3.7 spider.py -tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb +tar czvf externalErrorMsg_$(date +'%Y%m%d').tar.gz externalErrorMsg.pb From 64d08c0e4b141fb951f984c7793180b255a060a9 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 11 Oct 2021 11:43:55 +0800 Subject: [PATCH 45/80] fix bug of upload third party to bos (#36311) --- paddle/scripts/paddle_build.bat | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 0283de66ba5af..d675f4fdbdb61 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -395,15 +395,15 @@ if not exist %THIRD_PARTY_PATH% ( echo Getting third party: extracting ... tar -xf %md5%.tar.gz if !ERRORLEVEL! 
EQU 0 ( - echo Get third party from bos successfully + echo Get third party from bos successfully. ) else ( - echo Get third party failed, reason: extract failed, will build locally + echo Get third party failed, reason: extract failed, will build locally. ) del %md5%.tar.gz ) else ( - echo Get third party failed, reason: download failed, will build locally + echo Get third party failed, reason: download failed, will build locally. ) - if not exist %THIRD_PARTY_PATH% ( set UPLOAD_TP_FILE=ON ) + if not exist %THIRD_PARTY_PATH% set UPLOAD_TP_FILE=ON cd %work_dir%\%BUILD_DIR% ) else ( echo Found reusable third_party cache in %THIRD_PARTY_PATH%, will reuse it. @@ -540,18 +540,18 @@ if "%UPLOAD_TP_FILE%"=="ON" ( tar -zcf %md5%.tar.gz %md5% if !errorlevel! EQU 0 ( echo Uploading third_party: uploading ... - %PYTHON_ROOT%\python.exe %BCE_FILE% %md5%.tar.gz paddle-windows/third_party/%sub_dir% 1>nul + %PYTHON_ROOT%\python.exe !BCE_FILE! %md5%.tar.gz paddle-windows/third_party/%sub_dir% 1>nul if !errorlevel! EQU 0 ( - echo Upload third party to bos paddle-windows/third_party/%sub_dir% successfully + echo Upload third party %md5% to bos paddle-windows/third_party/%sub_dir% successfully. ) else ( - echo Failed upload third party to bos, reason: upload failed + echo Failed upload third party to bos, reason: upload failed. ) ) else ( - echo Failed upload third party to bos, reason: compress failed + echo Failed upload third party to bos, reason: compress failed. ) del %md5%.tar.gz ) else ( - echo Failed upload third party to bos, reason: install bce failed + echo Failed upload third party to bos, reason: install bce failed. ) cd %work_dir%\%BUILD_DIR% ) From 110613256898b2431654ab21cbd0ba869f99ec40 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 11 Oct 2021 12:17:21 +0800 Subject: [PATCH 46/80] [NPU] fix softmax_with_cross_entropy in dygraph, test=develop (#36297) --- .../operators/softmax_with_cross_entropy_op.cc | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 0c2d39e7519ef..78e813edda930 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" -#include -#include -#include -#include #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -54,8 +50,7 @@ class SoftmaxWithCrossEntropyOpMaker "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, " "where labels is ont-hot." "Currently, the tensor is generated and used in npu kernel only. 
") - .AsIntermediate() - .AsDispensable(); + .AsIntermediate(); #endif AddOutput("Loss", "(Tensor, default: Tensor), A tensor in same shape with " @@ -136,6 +131,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasOutput("Softmax"), true, platform::errors::InvalidArgument( "Output(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"), true, + platform::errors::InvalidArgument( + "Output(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasOutput("Loss"), true, platform::errors::InvalidArgument("Output(Loss) should be not null.")); @@ -225,6 +225,11 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, platform::errors::InvalidArgument( "Input(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"), true, + platform::errors::InvalidArgument( + "Input(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasInput("Label"), true, platform::errors::InvalidArgument("Input(Label) should be not null.")); From 83541fd45eb03d1d86e5403e17fd41274db65ced Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 11 Oct 2021 12:17:56 +0800 Subject: [PATCH 47/80] [NPU] fix set_value, test=develop (#36272) * [NPU] fix set_value, test=develop * fix typo, test=develop * fix typo, test=develop --- paddle/fluid/operators/set_value_op_npu.cc | 464 +++++------------- .../unittests/npu/test_set_value_op_npu.py | 334 ++++++------- 2 files changed, 274 insertions(+), 524 deletions(-) diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 3a8d81920f262..e7b124d5bddd6 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -10,291 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/set_value_op.h" -#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/npu_op_runner.h" -#include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { -template -class SetValueNPUKernel : public framework::OpKernel { - private: - using Vector_Int64 = std::vector; - void GetNPUStartEndSteps(const Vector_Int64& start, const Vector_Int64& end, - const Vector_Int64& steps, const Vector_Int64& axes, - const framework::DDim& in_dim, - std::vector>& output) const { - int rank = in_dim.size(); - for (int i = 0; i < rank; ++i) { - int axis_size = in_dim[i]; - auto iter = find(axes.begin(), axes.end(), i); - if (iter != axes.end()) { - int idx = iter - axes.begin(); - output[0].push_back(start[idx]); // set as the same as raw input - output[1].push_back(end[idx]); - output[2].push_back(steps[idx]); - } else { - output[0].push_back(0); // begin 0 - output[1].push_back(axis_size); // end = last one - output[2].push_back(1); // step = 1 - } - } - } - - inline std::vector MininumPadNumberMakeSureLastDimGT8( - const std::vector>& npu_slice) const { - int rank = npu_slice[0].size(); - int last_dim_start = npu_slice[0][rank - 1]; - int last_dim_end = npu_slice[1][rank - 1]; - int last_dim_step = npu_slice[2][rank - 1]; - int min_end = last_dim_start + last_dim_step * min_last_dim_value_; - int raw_last_dim_len = (last_dim_end - last_dim_start) / last_dim_step; - return std::vector({std::max(0, min_end - last_dim_end), - min_last_dim_value_ - raw_last_dim_len}); - } - - inline void TileTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - VLOG(4) << "start to tile tensor function, which calls the npu operator " - "TileWithAxis"; - // UNSQUEEZE last dim + TILE last dim * min_last_dim_value_ - Tensor reshape_tensor; - auto reshape_dims = framework::vectorize(input->dims()); - reshape_dims.push_back(1); - reshape_tensor.ShareDataWith(*input); - reshape_tensor.Resize(framework::make_ddim(reshape_dims)); - - auto output_dims = framework::vectorize(input->dims()); - output_dims.push_back(min_last_dim_value_); - output->mutable_data(framework::make_ddim(output_dims), ctx->GetPlace()); - - framework::NPUAttributeMap attr; - attr["axis"] = static_cast(reshape_dims.size() - 1); - attr["tiles"] = min_last_dim_value_; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("TileWithAxis", {reshape_tensor}, {*output}, attr).Run(stream); - } - - inline void BroadcastToD(const framework::ExecutionContext* ctx, - const Tensor* input, - const std::vector* shape, - Tensor* output) const { - VLOG(4) << "Start BroadCast To"; - auto new_shape = std::vector(shape->begin(), shape->end()); - output->mutable_data(framework::make_ddim(new_shape), ctx->GetPlace()); - framework::NPUAttributeMap attr; - attr["shape"] = new_shape; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("BroadcastToD", {*input}, {*output}, attr).Run(stream); - } - - inline void CropTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - auto out_dims = output->dims(); - auto in_dims = input->dims(); - int rank = in_dims.size(); - in_dims[rank - 1] = 1; - output->Resize(in_dims); // unsqueeze output -> [..., 1] - framework::NPUAttributeMap attr; - attr["axis"] = 0; - attr["offsets"] = std::vector(rank, 0); - auto stream = - ctx->template device_context() - 
.stream(); - NpuOpRunner("Crop", {*input, *output}, {*output}, attr).Run(stream); - output->Resize(out_dims); // restore it - } - - void SliceAssignNPU(const framework::ExecutionContext* ctx, - const Tensor* value_tensor, Vector_Int64& start, - Vector_Int64& end, Vector_Int64& steps, - Vector_Int64& axes, Tensor* assigned_tensor) const { - // must ensure assigned_tensor and value_tensor have the same shape - // not support steps < 0 - // output is also the assigned_tensor. - VLOG(4) << "start function SliceAssignND"; - auto stream = - ctx->template device_context() - .stream(); - for (size_t i = 0; i < steps.size(); ++i) { - PADDLE_ENFORCE_GT(steps[i], 0, - platform::errors::InvalidArgument( - "Currently NPU set_value operator doesn't support " - "negative steps, but got %d as step", - steps[i])); - } - std::vector> npu_slice(3); - GetNPUStartEndSteps(start, end, steps, axes, assigned_tensor->dims(), - npu_slice); - auto tile_numbers = MininumPadNumberMakeSureLastDimGT8(npu_slice); - int assigned_tensor_tile_number = tile_numbers[0]; - int value_tensor_tile_number = tile_numbers[1]; - VLOG(4) << "tile number is : " << assigned_tensor_tile_number << " " - << value_tensor_tile_number; - - Tensor tiled_assigned_tns, tiled_value_tns; - if (assigned_tensor_tile_number > 0) { - TileTensor(ctx, assigned_tensor, &tiled_assigned_tns); - TileTensor(ctx, value_tensor, &tiled_value_tns); - // output have different shape, so use a tmp variable before_crop_output; - // add last dim = min_last_dim_value_ in slice - npu_slice[0].push_back(0); - npu_slice[1].push_back(min_last_dim_value_); - npu_slice[2].push_back(1); - } - - framework::NPUAttributeMap attr_input; - attr_input["begin"] = - std::vector(npu_slice[0].begin(), npu_slice[0].end()); - attr_input["end"] = - std::vector(npu_slice[1].begin(), npu_slice[1].end()); - attr_input["strides"] = - std::vector(npu_slice[2].begin(), npu_slice[2].end()); - attr_input["begin_mask"] = 0; - attr_input["end_mask"] = 0; - attr_input["ellipsis_mask"] = 0; - attr_input["new_axis_mask"] = 0; - attr_input["shrink_axis_mask"] = 0; - if (assigned_tensor_tile_number > 0) { - NpuOpRunner("StridedSliceAssignD", {tiled_assigned_tns, tiled_value_tns}, - {tiled_assigned_tns}, attr_input) - .Run(stream); // Remember, set output = input, and this op will - // change the input value. - } else { - NpuOpRunner("StridedSliceAssignD", {*assigned_tensor, *value_tensor}, - {*assigned_tensor}, attr_input) - .Run(stream); - } - if (assigned_tensor_tile_number > 0) { - CropTensor(ctx, &tiled_assigned_tns /*initialzied*/, - assigned_tensor /*initalized*/); - } - } - - void ModifyAxesAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& axes_to_modify) const { - if (none_axes.empty()) return; - auto none_axes_copy = none_axes; - sort(none_axes_copy.begin(), none_axes_copy.end()); - for (size_t i = 0; i < axes_to_modify.size(); ++i) { - int axis = axes_to_modify[i]; - auto upper = - upper_bound(none_axes_copy.begin(), none_axes_copy.end(), axis); - // Example: none_axes = [1,3,4,5,7] - // axis = 4 - // find the element number less or equal than 4, which is - // 3(1,3,4) - // axis becomes 4 + 3 = 7 ; - axes_to_modify[i] = axis + (upper - none_axes_copy.begin()); - } - } - - void UnsqueezeAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& slice_dims) const { - // note : axes will change, because new axes inserted. - // sum array to modify the axes. 
because more simply - if (none_axes.empty()) return; - Vector_Int64 slice_dims_with_none; - size_t none_axes_cur = 0; - for (size_t i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= static_cast(i)) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims_with_none.push_back(slice_dims[i]); - } - // if the none_axes.size() > slice_dims.size(), append 1 after last dim - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims = slice_dims_with_none; - } +using NPUDeviceContext = platform::NPUDeviceContext; - void ModiftyDimsAccordingNoneAndDecrease(Vector_Int64& slice_dim, - Vector_Int64& value_dim, - Vector_Int64& axes, - Vector_Int64& none_axes, - Vector_Int64& dec_axes) const { - // change the value of slice_dim, value_dim, start, end, steps, axes by none - // and decrease axes - // after change, this values can be passed to SliceAssignNPU() directly. - - // Modity Slice Dim - UnsqueezeAccordingNoneAxes(none_axes, slice_dim); - ModifyAxesAccordingNoneAxes(none_axes, dec_axes); - ModifyAxesAccordingNoneAxes(none_axes, axes); - // Modity Value Dim by new slice dim - auto slice_dim_reverse = slice_dim; - auto value_dim_reverse = value_dim; - std::reverse(slice_dim_reverse.begin(), slice_dim_reverse.end()); - std::reverse(value_dim_reverse.begin(), value_dim_reverse.end()); - - Vector_Int64 new_value_dim; - PADDLE_ENFORCE_GE( - slice_dim.size(), value_dim.size(), - platform::errors::InvalidArgument("The size of expanded slice_dim(%d) " - "must greater than the value_dim(%d)", - slice_dim.size(), value_dim.size())); +template +class SetValueNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("Input"); + auto* value_tensor = ctx.Input("ValueTensor"); + auto* out = ctx.Output("Out"); - size_t value_cur = 0; - size_t rank = slice_dim.size(); - for (size_t i = 0; i < rank; ++i) { - auto& xsize = slice_dim_reverse[i]; - if (value_cur >= value_dim_reverse.size()) { - new_value_dim.push_back(1); - continue; - } - auto& vsize = value_dim_reverse[value_cur]; - auto it = find(dec_axes.begin(), dec_axes.end(), rank - 1 - i); - if (it != dec_axes.end()) { - // found, insert one dim ; - PADDLE_ENFORCE_EQ(xsize, 1, platform::errors::InvalidArgument( - "The dims refered by decrease axes is " - "not equal to 1, some wrongs happen")); - new_value_dim.push_back(1); - continue; - } - if (xsize == vsize || vsize == 1) { - new_value_dim.push_back(vsize); - ++value_cur; - continue; - } - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - for (; value_cur < value_dim_reverse.size(); ++value_cur) { - if (value_dim_reverse[value_cur] != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - } - std::reverse(new_value_dim.begin(), new_value_dim.end()); - value_dim = new_value_dim; - return; - } + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto steps_tensor_list = ctx.MultiInput("StepsTensorList"); - public: - void Compute(const framework::ExecutionContext& ctx) const override { - VLOG(2) << "Start Set Value Npu Kernel"; - auto* in = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - auto* value_tensor = 
ctx.Input("ValueTensor"); - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); auto ends = ctx.Attr>("ends"); @@ -302,17 +39,6 @@ class SetValueNPUKernel : public framework::OpKernel { auto shape = ctx.Attr>("shape"); auto decrease_axes = ctx.Attr>("decrease_axes"); auto none_axes = ctx.Attr>("none_axes"); - auto dtype = in->type(); - - if (dtype == framework::proto::VarType::FP64 || - dtype == framework::proto::VarType::INT64 || - dtype == framework::proto::VarType::BOOL) { - auto value_type_name = GetValueName(dtype); - PADDLE_THROW(platform::errors::InvalidArgument( - "The NPU setvalue kernel currently only support FLOAT32 and INT32, " - "but got type: %s", - value_type_name.data())); - } if (!starts_tensor_list.empty()) { starts = GetDataFromTensorList(starts_tensor_list); @@ -327,65 +53,137 @@ class SetValueNPUKernel : public framework::OpKernel { auto in_dims = in->dims(); CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto place = ctx.GetPlace(); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); + + auto slice_dims_for_assign = decrease_slice_dims; + if (!none_axes.empty()) { + std::vector slice_dims_with_none; + + size_t none_axes_cur = 0, decrease_axes_cur = 0; + for (int i = 0; i < slice_dims.size(); ++i) { + while (none_axes_cur < none_axes.size() && + none_axes[none_axes_cur] <= i) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + if (decrease_axes_cur < decrease_axes.size() && + decrease_axes[decrease_axes_cur] == i) { + decrease_axes_cur++; + } else { + slice_dims_with_none.push_back(slice_dims[i]); + } + } + while (none_axes_cur < none_axes.size()) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } - // aforementioned code is copyed directly from CPU kernel. - // (@xiongkun03) the following is redesigned by xiongkun. because NPU can do - // step slice assignment. so we deal with all none_axes and decrease_axes - // here. - // 1. we insert 1 into assigned_tensor_shape according to none_axes; - // 2. we insert 1 into value_tensor_shape(value tensor) according to - // decrease_axes; - // 3. we reshape back the assigned_tensor. and return it. - // note : we use a tmp_value_tensor as value_tns. it shares data with - // value_tensor; - // I believe the logic is more simple than cpu logic. 
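The rewritten kernel added below drops the old StridedSliceAssignD/Tile path: it enumerates the flat (row-major) offset of every element selected by the slice, then writes the broadcast value tensor into a flattened copy of the input with ScatterUpdate. A minimal NumPy sketch of that index enumeration; the helper name and example shapes are illustrative only, not taken from the kernel:

```
import numpy as np

def slice_to_flat_indices(shape, starts, ends, steps):
    # Expand one axis at a time, like the index_indices loop below: every
    # existing base offset is fanned out across the k-range of this axis.
    indices = [0]
    stride = int(np.prod(shape))
    for dim, (start, end, step) in enumerate(zip(starts, ends, steps)):
        stride //= shape[dim]
        indices = [base + k * stride
                   for base in indices
                   for k in range(start, end, step)]
    return indices

# Example: x[1:4:2, 0:2] = value on a (5, 3) tensor.
x = np.arange(15, dtype=np.float32).reshape(5, 3)
value = np.full((2, 2), -1.0, dtype=np.float32)

flat = x.reshape(-1).copy()
idx = slice_to_flat_indices(x.shape, starts=[1, 0], ends=[4, 2], steps=[2, 1])
flat[idx] = value.reshape(-1)   # the ScatterUpdate step on the flat tensor
out = flat.reshape(x.shape)

ref = x.copy()
ref[1:4:2, 0:2] = value
assert np.array_equal(out, ref)
```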
+ slice_dims_for_assign = framework::make_ddim(slice_dims_with_none); + } + + TensorCopy(*in, ctx.GetPlace(), out); + + auto starts_indices = std::vector(in_dims.size(), 0); + auto ends_indices = std::vector(in_dims.size(), 0); + auto strides_indices = std::vector(in_dims.size(), 0); + + for (int i = 0; i < in_dims.size(); ++i) { + starts_indices[i] = 0; + ends_indices[i] = slice_dims[i]; + strides_indices[i] = 1; + } + for (size_t i = 0; i < axes.size(); i++) { + int axis_index = axes[i]; + starts_indices[axis_index] = starts[i]; + ends_indices[axis_index] = ends[i]; + strides_indices[axis_index] = steps[i]; + } + + int64_t stride_step = framework::product(in_dims); + std::vector index_indices(1, 0); + for (size_t i = 0; i < strides_indices.size(); ++i) { + auto index_size = index_indices.size(); + stride_step /= in_dims[i]; + for (size_t j = 0; j < index_size; ++j) { + auto start_index = *index_indices.begin(); + if (strides_indices[i] > 0) { + for (int64_t k = starts_indices[i]; k < ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } else { + for (int64_t k = starts_indices[i]; k > ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } + index_indices.erase(index_indices.begin()); + } + } - TensorCopy(*in, place, out); - Tensor value_t(dtype); + PADDLE_ENFORCE_EQ( + static_cast(index_indices.size()), + framework::product(slice_dims_for_assign), + platform::errors::InvalidArgument( + "OP(set_value) error index indices and value update not match ")); - if (value_tensor == nullptr) { + Tensor value_t(in->type()); + if (value_tensor != nullptr) { + value_t.ShareDataWith(*value_tensor); + } else { auto value_dims = framework::make_ddim(shape); - value_t.mutable_data(value_dims, place); - auto value_name = GetValueName(dtype); + CheckIsDimsMatch(slice_dims_for_assign, value_dims); + + value_t.mutable_data(value_dims, ctx.GetPlace()); + auto value_name = GetValueName(in->type()); CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); value_t.Resize(value_dims); } - const Tensor* value_tensor_ptr = - (value_tensor == nullptr) ? 
&value_t : value_tensor; - auto value_dims_vec = framework::vectorize(value_tensor_ptr->dims()); - auto slice_dims_vec = framework::vectorize(slice_dims); - auto in_dims_vec = framework::vectorize(in_dims); - - UnsqueezeAccordingNoneAxes(none_axes, in_dims_vec); - ModiftyDimsAccordingNoneAndDecrease(slice_dims_vec, value_dims_vec, axes, - none_axes, - decrease_axes); // Modify and Check + auto stream = ctx.template device_context().stream(); - Tensor reshaped_value_tensor, broadcast_value_tensor; - reshaped_value_tensor.ShareDataWith(*value_tensor_ptr); - reshaped_value_tensor.Resize(framework::make_ddim(value_dims_vec)); - - BroadcastToD(&ctx, &reshaped_value_tensor, &slice_dims_vec, - &broadcast_value_tensor /*inner function initialized*/); + Tensor value_temp(in->type()); + if (slice_dims_for_assign == value_t.dims()) { + value_temp.ShareDataWith(value_t); + } else { + value_temp.Resize(slice_dims_for_assign); + value_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(value_t) + .AddInput(framework::vectorize(slice_dims_for_assign)) + .AddOutput(value_temp) + .Run(stream); + } - out->Resize(framework::make_ddim(in_dims_vec)); - SliceAssignNPU(&ctx, &broadcast_value_tensor, starts, ends, steps, axes, - out); - out->Resize(in_dims); // Reshape Back + int64_t input_numel = framework::product(in_dims); + int64_t index_numel = index_indices.size(); + + Tensor in_temp, out_temp, val_temp; + in_temp.ShareDataWith(*in); + out_temp.ShareDataWith(*out); + val_temp.ShareDataWith(value_temp); + in_temp.Resize(framework::make_ddim({input_numel})); + out_temp.Resize(framework::make_ddim({input_numel})); + val_temp.Resize(framework::make_ddim({index_numel})); + + NpuOpRunner runner; + runner.SetType("ScatterUpdate") + .AddInput(in_temp) + .AddInput(std::move(index_indices)) + .AddInput(val_temp) + .AddOutput(out_temp) + .Run(stream); } - - private: - const int min_last_dim_value_ = - 32 / sizeof(T); // 16 for float16 , 8 for float32 }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - set_value, ops::SetValueNPUKernel, - ops::SetValueNPUKernel) + +REGISTER_OP_NPU_KERNEL(set_value, ops::SetValueNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SetValueNPUKernel, +#endif + ops::SetValueNPUKernel) diff --git a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py index e819f422f2b44..421ea1df4cff0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py @@ -23,13 +23,15 @@ import paddle.fluid as fluid from paddle.fluid import core -SEED = 2021 - class TestSetValueBase(unittest.TestCase): - def set_input(self): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + paddle.enable_static() self.set_npu() - paddle.device.set_device('npu') self.set_dtype() self.set_value() self.set_shape() @@ -51,9 +53,6 @@ def _call_setitem(self, x): def _get_answer(self): self.data[0, 0] = self.value - def set_npu(self): - self.__class__.use_npu = True - class TestSetValueApi(TestSetValueBase): def _run_static(self): @@ -62,13 +61,13 @@ def _run_static(self): x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) - exe = paddle.static.Executor(paddle.NPUPlace(0)) + exe = paddle.static.Executor(self.place) out = exe.run(self.program, 
fetch_list=[x]) paddle.disable_static() return out def _run_dynamic(self): - paddle.disable_static(paddle.NPUPlace(0)) + paddle.disable_static(self.place) x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) out = x.numpy() @@ -76,7 +75,6 @@ def _run_dynamic(self): return out def test_api(self): - self.set_input() static_out = self._run_static() dynamic_out = self._run_dynamic() self._get_answer() @@ -134,23 +132,22 @@ def _get_answer(self): self.data[0:, 1:2, :] = self.value -""" FIXEME : it seams that NPU don't support while operator ??? -class TestSetValueItemSliceInWhile(TestSetValueApi): - def _call_setitem(self, x): - def cond(i, x): - return i < 1 +# TODO(qili93): Fix this after NPU support while_loop +# class TestSetValueItemSliceInWhile(TestSetValueApi): +# def _call_setitem(self, x): +# def cond(i, x): +# return i < 1 - def body(i, x): - x[i] = self.value - i = i + 1 - return i, x - with paddle.static.device_guard("npu"): - i = paddle.zeros(shape=(1, ), dtype='int32') - i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) +# def body(i, x): +# x[i] = self.value +# i = i + 1 +# return i, x - def _get_answer(self): - self.data[0] = self.value -""" +# i = paddle.zeros(shape=(1, ), dtype='int32') +# i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) + +# def _get_answer(self): +# self.data[0] = self.value # 1.2.2 step > 1 @@ -192,6 +189,60 @@ def _get_answer(self): self.data[0:, 1:2:2, :] = self.value +# 1.2.3 step < 0 +class TestSetValueItemSliceNegetiveStep(TestSetValueApi): + def set_shape(self): + self.shape = [5, 2] + + def set_value(self): + self.value = np.array([3, 4]) + + def _call_setitem(self, x): + x[5:2:-1] = self.value + + def _get_answer(self): + self.data[5:2:-1] = self.value + + +class TestSetValueItemSliceNegetiveStep2(TestSetValueApi): + def set_shape(self): + self.shape = [5] + + def set_value(self): + self.value = np.array([3, 4]) + + def _call_setitem(self, x): + x[1::-1] = self.value + + def _get_answer(self): + self.data[1::-1] = self.value + + +class TestSetValueItemSliceNegetiveStep3(TestSetValueApi): + def set_shape(self): + self.shape = [3] + + def set_value(self): + self.value = np.array([3, 4, 5]) + + def _call_setitem(self, x): + x[::-1] = self.value + + def _get_answer(self): + self.data[::-1] = self.value + + +class TestSetValueItemSliceNegetiveStep4(TestSetValueApi): + def set_shape(self): + self.shape = [3, 4, 5] + + def _call_setitem(self, x): + x[2:0:-1, 0:2, ::-1] = self.value + + def _get_answer(self): + self.data[2:0:-1, 0:2, ::-1] = self.value + + # 1.3 item is Ellipsis @@ -277,6 +328,19 @@ def _get_answer(self): self.data[0:, 1:2:2, :] = self.value +class TestSetValueItemTensor6(TestSetValueApi): + def set_shape(self): + self.shape = [3, 4, 5] + + def _call_setitem(self, x): + minus1 = paddle.full([1], -1, dtype="int32") + zero = paddle.full([1], 0, dtype="int32") + x[2:zero:minus1, 0:2, 10:-6:minus1] = self.value + + def _get_answer(self): + self.data[2:0:-1, 0:2, ::-1] = self.value + + # 1.5 item is None class TestSetValueItemNone1(TestSetValueApi): def _call_setitem(self, x): @@ -350,133 +414,99 @@ def _get_answer(self): self.data[None, :, 1, ..., None] = np.zeros(self.shape)[0, 0, :, None] -""" FIXME : current NPU set_value don't support negative step !!! 
- @xiongkun03 +# 1.5 item is list or Tensor of bol +class TestSetValueItemBool1(TestSetValueApi): + def _call_setitem(self, x): + x[[True, False]] = self.value -class TestSetValueItemTensor6(TestSetValueApi): - def set_shape(self): - self.shape = [3, 4, 5] + def _get_answer(self): + self.data[[True, False]] = self.value + +class TestSetValueItemBool2(TestSetValueApi): def _call_setitem(self, x): - minus1 = paddle.full([1], -1, dtype="int32") - zero = paddle.full([1], 0, dtype="int32") - x[2:zero:minus1, 0:2, 10:-6:minus1] = self.value + x[[False, False]] = self.value def _get_answer(self): - self.data[2:0:-1, 0:2, ::-1] = self.value -""" + self.data[[False, False]] = self.value -# 2. Test different type of value: int, float, numpy.ndarray, Tensor -# 2.1 value is int32, int64, float32, float64, bool +class TestSetValueItemBool3(TestSetValueApi): + def _call_setitem(self, x): + x[[False, True]] = np.zeros(self.shape[2]) -def create_test_value_int32(parent): - class TestValueInt(parent): - def set_value(self): - self.value = 7 + def _get_answer(self): + self.data[[False, True]] = np.zeros(self.shape[2]) - def set_dtype(self): - self.dtype = "int32" - - cls_name = "{0}_{1}".format(parent.__name__, "ValueInt32") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt +class TestSetValueItemBool4(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign(np.array([False, True])) + x[idx] = np.zeros(self.shape[2]) -create_test_value_int32(TestSetValueItemInt) -create_test_value_int32(TestSetValueItemSlice) -create_test_value_int32(TestSetValueItemSlice2) -create_test_value_int32(TestSetValueItemSlice3) -create_test_value_int32(TestSetValueItemSlice4) + def _get_answer(self): + self.data[np.array([False, True])] = np.zeros(self.shape[2]) -def create_test_value_numpy_fp32(parent): - class TestValueInt(parent): - def set_value(self): - self.value = np.array([1]) +class TestSetValueItemBool5(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign( + np.array([[False, True, False], [True, True, False]])) + x[idx] = self.value - def set_dtype(self): - self.dtype = "float32" + def _get_answer(self): + self.data[np.array([[False, True, False], [True, True, False] + ])] = self.value - cls_name = "{0}_{1}".format(parent.__name__, "ValueNumpyFp32") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt +class TestSetValueItemBool6(TestSetValueApi): + def _call_setitem(self, x): + x[0, ...] = 0 + x[x > 0] = self.value -create_test_value_numpy_fp32(TestSetValueItemInt) -create_test_value_numpy_fp32(TestSetValueItemSlice) -create_test_value_numpy_fp32(TestSetValueItemSlice2) -create_test_value_numpy_fp32(TestSetValueItemSlice3) -create_test_value_numpy_fp32(TestSetValueItemSlice4) + def _get_answer(self): + self.data[0, ...] 
= 0 + self.data[self.data > 0] = self.value -def create_test_value_numpy_fp64(parent): +def create_test_value_int32(parent): class TestValueInt(parent): def set_value(self): - self.value = np.array([2**127]).astype("float64") - - def set_dtype(self): - self.dtype = "float64" - - cls_name = "{0}_{1}".format(parent.__name__, "ValueNumpyFp64") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt - - -create_test_value_numpy_fp64(TestSetValueItemInt) -create_test_value_numpy_fp64(TestSetValueItemSlice) -create_test_value_numpy_fp64(TestSetValueItemSlice2) -create_test_value_numpy_fp64(TestSetValueItemSlice3) -create_test_value_numpy_fp64(TestSetValueItemSlice4) - + self.value = 7 -# 2.3 value is a Paddle Tensor (int32, int64, float32, float64, bool) -def create_test_value_tensor_int32(parent): - class TestValueInt(parent): def set_dtype(self): self.dtype = "int32" - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorInt32") + cls_name = "{0}_{1}".format(parent.__name__, "ValueInt32") TestValueInt.__name__ = cls_name globals()[cls_name] = TestValueInt -create_test_value_tensor_int32(TestSetValueItemInt) -create_test_value_tensor_int32(TestSetValueItemSlice) -create_test_value_tensor_int32(TestSetValueItemSlice2) -create_test_value_tensor_int32(TestSetValueItemSlice3) -create_test_value_tensor_int32(TestSetValueItemSlice4) +create_test_value_int32(TestSetValueItemInt) +create_test_value_int32(TestSetValueItemSlice) +create_test_value_int32(TestSetValueItemSlice2) +create_test_value_int32(TestSetValueItemSlice3) +create_test_value_int32(TestSetValueItemSlice4) -def create_test_value_tensor_int64(parent): +def create_test_value_int64(parent): class TestValueInt(parent): + def set_value(self): + self.value = 7 + def set_dtype(self): self.dtype = "int64" - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorInt64") + cls_name = "{0}_{1}".format(parent.__name__, "ValueInt64") TestValueInt.__name__ = cls_name globals()[cls_name] = TestValueInt -create_test_value_tensor_int64(TestSetValueItemInt) -create_test_value_tensor_int64(TestSetValueItemSlice) -create_test_value_tensor_int64(TestSetValueItemSlice2) -create_test_value_tensor_int64(TestSetValueItemSlice3) -create_test_value_tensor_int64(TestSetValueItemSlice4) +create_test_value_int64(TestSetValueItemInt) +create_test_value_int64(TestSetValueItemSlice) +create_test_value_int64(TestSetValueItemSlice2) +create_test_value_int64(TestSetValueItemSlice3) +create_test_value_int64(TestSetValueItemSlice4) def create_test_value_tensor_fp32(parent): @@ -503,30 +533,6 @@ def _get_answer(self): create_test_value_tensor_fp32(TestSetValueItemSlice4) -def create_test_value_tensor_fp64(parent): - class TestValueInt(parent): - def set_dtype(self): - self.dtype = "float64" - - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorFp64") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt - - -create_test_value_tensor_fp64(TestSetValueItemInt) -create_test_value_tensor_fp64(TestSetValueItemSlice) 
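The dtype-specific variants in this test file are produced by small factory helpers: each one subclasses a base case, overrides set_value/set_dtype (or _call_setitem), renames the class, and registers it through globals() so that unittest discovery picks it up. A self-contained sketch of that pattern, with illustrative names only:

```
import unittest


class TestSetValueLikeBase(unittest.TestCase):
    def set_value(self):
        self.value = 6

    def test_roundtrip(self):
        self.set_value()
        data = [0, 0, 0]
        data[1] = self.value
        self.assertEqual(data[1], self.value)


def create_test_with_value(parent, value, suffix):
    class TestGenerated(parent):
        def set_value(self):
            self.value = value

    cls_name = "{0}_{1}".format(parent.__name__, suffix)
    TestGenerated.__name__ = cls_name
    globals()[cls_name] = TestGenerated


create_test_with_value(TestSetValueLikeBase, 7, "ValueInt7")

if __name__ == "__main__":
    unittest.main()
```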
-create_test_value_tensor_fp64(TestSetValueItemSlice2) -create_test_value_tensor_fp64(TestSetValueItemSlice3) -create_test_value_tensor_fp64(TestSetValueItemSlice4) - - # 3. Test different shape of value class TestSetValueValueShape1(TestSetValueApi): def set_value(self): @@ -589,59 +595,5 @@ def _get_answer(self): self.data[:, 0] = self.value -# 4. Test error -class TestError(TestSetValueBase): - def _value_type_error(self): - with self.assertRaisesRegexp( - TypeError, - "Only support to assign an integer, float, numpy.ndarray or paddle.Tensor" - ): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - value = [1] - x[0] = value - - def _dtype_error(self): - with self.assertRaisesRegexp( - TypeError, - "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " - ): - y = paddle.ones(shape=self.shape, dtype="float16") - y[0] = 1 - - def _step_error(self): - with self.assertRaisesRegexp(ValueError, "step can not be 0"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - x[0:1:0] = self.value - - def _ellipsis_error(self): - with self.assertRaisesRegexp( - IndexError, "An index can only have a single ellipsis"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - x[..., ...] = self.value - with self.assertRaisesRegexp(ValueError, "the start or end is None"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - one = paddle.ones([1]) - x[::one] = self.value - - def _broadcast_mismatch(self): - program = paddle.static.Program() - with paddle.static.program_guard(program): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - value = np.array([3, 4, 5, 6, 7]) - x[0] = value - exe = paddle.static.Executor(paddle.CPUPlace()) - with self.assertRaises(ValueError): - exe.run(program) - - def test_error(self): - self.set_input() - paddle.enable_static() - with paddle.static.program_guard(self.program): - self._value_type_error() - self._dtype_error() - self._step_error() - self._broadcast_mismatch() - - if __name__ == '__main__': unittest.main() From 7850f7ce0ac70cb52dd071579aea64cdd235efd5 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Mon, 11 Oct 2021 14:12:56 +0800 Subject: [PATCH 48/80] [NPU] fix matmul_v2 and utils.run_check, test=develop (#36164) * [NPU] fix matmul_v2 and utils.run_check, test=develop * remove debug files, test=develop * fix install_check, test=develop * fix doc, test=develop * fix review comments, test=develop --- paddle/fluid/operators/matmul_v2_op_npu.cc | 477 ++++++++++++----- python/paddle/fluid/framework.py | 70 +++ .../fluid/tests/unittests/npu/CMakeLists.txt | 1 + .../unittests/npu/test_matmulv2_op_npu.py | 504 +++++++++++------- python/paddle/static/__init__.py | 2 + python/paddle/utils/install_check.py | 58 +- 6 files changed, 768 insertions(+), 344 deletions(-) diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index b23b408e9c59a..6d7e8f3478c84 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,166 +21,387 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class MatMulV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("trans_x"); - bool transpose_y = ctx.Attr("trans_y"); - - if (x->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "MatMul", {*x, *y}, {*out}, - {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); + + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + auto stream = ctx.template device_context().stream(); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ( + X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers must be equal to Y's numbers," + "when X/Y's dims =1. 
But received X has [%d] elements," + "received Y has [%d] elements", + X->numel(), Y->numel())); + Out->Resize({1}); + Out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); + const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); runner.Run(stream); + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + y_ndim = 2; + out_ndim += 1; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (trans_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); } }; -template +template class MatMulV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = 
ctx.Output(framework::GradVarName("Y")); - bool transpose_y = ctx.Attr("trans_y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); - runner_dy.Run(stream); + if (dX) { + dX->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); + runner_dx.Run(stream); + } + if (dY) { + dY->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); + runner_dy.Run(stream); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (trans_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); 
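The MatMul2D calls in this 2-D branch implement the standard matmul gradient identities for every (trans_x, trans_y) combination: dX = dOut @ op_y(Y).T when trans_x is false, otherwise op_y(Y) @ dOut.T; and dY = op_x(X).T @ dOut when trans_y is false, otherwise dOut.T @ op_x(X). A small NumPy check of those identities against finite differences (illustrative only, helper names are not from the kernel):

```
import numpy as np

def forward(x, y, trans_x, trans_y):
    return (x.T if trans_x else x) @ (y.T if trans_y else y)

def analytic_grads(x, y, dout, trans_x, trans_y):
    # dX = dOut @ op_y(Y).T   (trans_x False)  or  op_y(Y) @ dOut.T  (trans_x True)
    # dY = op_x(X).T @ dOut   (trans_y False)  or  dOut.T @ op_x(X)  (trans_y True)
    op_x = x.T if trans_x else x
    op_y = y.T if trans_y else y
    dx = op_y @ dout.T if trans_x else dout @ op_y.T
    dy = dout.T @ op_x if trans_y else op_x.T @ dout
    return dx, dy

def numeric_grad(f, a, eps=1e-6):
    g = np.zeros_like(a)
    for idx in np.ndindex(a.shape):
        a[idx] += eps
        hi = f()
        a[idx] -= 2 * eps
        lo = f()
        a[idx] += eps
        g[idx] = (hi - lo) / (2 * eps)
    return g

rng = np.random.RandomState(0)
M, K, N = 3, 4, 5
for trans_x in (False, True):
    for trans_y in (False, True):
        x = rng.randn(K, M) if trans_x else rng.randn(M, K)
        y = rng.randn(N, K) if trans_y else rng.randn(K, N)
        dout = rng.randn(M, N)
        loss = lambda: (dout * forward(x, y, trans_x, trans_y)).sum()
        dx, dy = analytic_grads(x, y, dout, trans_x, trans_y)
        assert np.allclose(dx, numeric_grad(loss, x), atol=1e-4)
        assert np.allclose(dy, numeric_grad(loss, y), atol=1e-4)
```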
+ } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - runner_dx.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() == 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", 
{x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, + true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !trans_y); } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, + false); + } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2, ops::MatMulV2NPUKernel, + ops::MatMulV2NPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2_grad, ops::MatMulV2GradNPUKernel, + ops::MatMulV2GradNPUKernel); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 7f2937b9af764..4d90b9159470e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -55,6 +55,7 @@ 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_xpu', + 'is_compiled_with_npu', 'Variable', 'require_version', 'device_guard', @@ -380,6 +381,15 @@ def _xpu_ids(): return device_ids +def _npu_ids(): + npus_env = os.getenv("FLAGS_selected_npus") + if npus_env: + device_ids = [int(s) for s in npus_env.split(",")] + else: + device_ids = six.moves.range(core.get_npu_device_count()) + return device_ids + + def is_compiled_with_xpu(): """ Whether this whl package can be used to run the model on XPU. @@ -395,6 +405,21 @@ def is_compiled_with_xpu(): return core.is_compiled_with_xpu() +def is_compiled_with_npu(): + """ + Whether this whl package can be used to run the model on NPU. + + Returns (bool): support npu or not. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + support_npu = fluid.is_compiled_with_npu() + """ + return core.is_compiled_with_npu() + + def disable_signal_handler(): """ Reset signal handler registered by Paddle. @@ -538,6 +563,47 @@ def xpu_places(device_ids=None): return [core.XPUPlace(dev_id) for dev_id in device_ids] +def npu_places(device_ids=None): + """ + **Note**: + For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. + + This function creates a list of :code:`paddle.NPUPlace` objects. + If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_npus` would be checked first. For example, if + :code:`FLAGS_selected_npus=0,1,2`, the returned list would + be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + If :code:`FLAGS_selected_npus` is not set, all visible + npu places would be returned. 
+ If :code:`device_ids` is not None, it should be the device + ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + + Parameters: + device_ids (list or tuple of int, optional): list of NPU device ids. + Returns: + list of paddle.NPUPlace: Created NPU place list. + Examples: + .. code-block:: python + + # required: npu + + import paddle + import paddle.static as static + + paddle.enable_static() + npu_places = static.npu_places() + """ + assert core.is_compiled_with_npu(), \ + "Not compiled with NPU" + if device_ids is None: + device_ids = _npu_ids() + elif not isinstance(device_ids, (list, tuple)): + device_ids = [device_ids] + return [core.NPUPlace(dev_id) for dev_id in device_ids] + + def cpu_places(device_count=None): """ This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. @@ -1927,6 +1993,10 @@ def set_value(self, value, scope=None): p = core.Place() p.set_place(t._place()) place = core.XPUPlace(p.xpu_device_id()) + elif p.is_npu_place(): + p = core.Place() + p.set_place(t._place()) + place = core.NPUPlace(p.npu_device_id()) else: p = core.Place() p.set_place(t._place()) diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 44b3c6764a7cf..4e81bb9544ceb 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -20,4 +20,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 53766c5eb61b7..882043ef6eb91 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -21,56 +21,35 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +from test_matmul_v2_op import reference_matmul paddle.enable_static() SEED = 2021 -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. - if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if not Out.shape: - # We do not support 0-dimensional Tensors (scalars). So where - # np.matmul outputs a scalar, we must convert to a Tensor of - # shape (1) instead. - # Everywhere else, we are compatible with np.matmul. 
- Out = np.array([Out], dtype="float64") - return Out - - -class TestMatMul(OpTest): +class TestMatMulV2Op(OpTest): + """ + case 1 + """ + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def config(self): - self.x_shape = (100, 24) - self.y_shape = (24, 100) + self.x_shape = (100, ) + self.y_shape = (100, ) self.trans_x = False self.trans_y = False + def init_kernel_type(self): + self.dtype = "float32" + def setUp(self): self.set_npu() - self.op_type = "matmul_v2" - self.place = paddle.NPUPlace(0) - self.init_dtype() + self.init_kernel_type() self.config() - np.random.seed(SEED) + self.op_type = "matmul_v2" x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 @@ -85,201 +64,314 @@ def setUp(self): self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True - - def init_dtype(self): - self.dtype = np.float32 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # -class TestMatMul2(TestMatMul): +class TestMatMuklOp2(TestMatMulV2Op): """ case 2 """ def config(self): - self.x_shape = (32, 24) - self.y_shape = (32, 24) + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) self.trans_x = False self.trans_y = True -class TestMatMul3(TestMatMul): +class TestMatMuklOp3(TestMatMulV2Op): """ case 3 """ - def init_dtype(self): - self.dtype = np.float16 + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False -class TestMatMul4(TestMatMul): +class TestMatMuklOp4(TestMatMulV2Op): """ - case 4 dim=3 + case 4 """ def config(self): - self.x_shape = (2, 3, 4) - self.y_shape = (2, 4, 3) + self.x_shape = (100, ) + self.y_shape = (1, 2, 100, 2) self.trans_x = False self.trans_y = False -class TestMatMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(2, 3)).astype('float32') - b_np = np.random.random(size=(2, 3)).astype('float32') - c_np = np.random.random(size=(3, 2)).astype('float32') - d_np = np.random.random(size=(3, 2)).astype('float32') - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') - b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') - c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') - d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - result = paddle.matmul(sum_1, sum_2) - - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if 
run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) - - -# The precision is aligned in NPU and GPU separately, which is only used for the usage method. - - -class TestMatMulNet3_2(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - self._dtype = "float32" - - a_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - b_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - c_np = np.random.random(size=(3, 2)).astype(self._dtype) - d_np = np.random.random(size=(3, 2)).astype(self._dtype) - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 1, 3], dtype=self._dtype) - b = paddle.static.data(name="b", shape=[2, 1, 3], dtype=self._dtype) - c = paddle.static.data(name="c", shape=[3, 2], dtype=self._dtype) - d = paddle.static.data(name="d", shape=[3, 2], dtype=self._dtype) - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - sum_1 = paddle.cast(sum_1, 'float16') - sum_2 = paddle.cast(sum_2, 'float16') - if not run_npu: - sum_1 = paddle.cast(sum_1, 'float32') - sum_2 = paddle.cast(sum_2, 'float32') - - result = paddle.matmul(sum_1, sum_2) - if run_npu: - result = paddle.cast(result, 'float32') - - result = paddle.reshape(result, shape=[2, 2]) - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: +class TestMatMuklOp5(TestMatMulV2Op): + """ + case 5 + """ + + def config(self): + self.x_shape = (1, 1, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp6(TestMatMulV2Op): + """ + case 6 + """ + + def config(self): + self.x_shape = (1, 2, 102, 1) + self.y_shape = (102, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp7(TestMatMulV2Op): + """ + case 7 + """ + + def config(self): + self.x_shape = (1, 2, 1, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp8(TestMatMulV2Op): + """ + case 8 + """ + + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp9(TestMatMulV2Op): + """ + case 9 + """ + + def config(self): + self.x_shape = (1, 1, 1, 100) + self.y_shape = (2, 1, 2, 100) + self.trans_x = False + self.trans_y = True + + +class TestMatMuklOp10(TestMatMulV2Op): + """ + case 10 + """ + + def config(self): + 
self.x_shape = (1, 1, 25, 4) + self.y_shape = (1, 2, 4, 25) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp11(TestMatMulV2Op): + """ + case 11 + """ + + def config(self): + self.x_shape = (2, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp12(TestMatMulV2Op): + """ + case 12 + """ + + def config(self): + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp13(TestMatMulV2Op): + """ + case 13 + """ + + def config(self): + self.x_shape = (2, 2, 10, 10) + self.y_shape = (2, 2, 10, 10) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp14(TestMatMulV2Op): + """ + case 14_1 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp15(TestMatMulV2Op): + """ + case 14_2 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp16(TestMatMulV2Op): + """ + case 16 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp17(TestMatMulV2Op): + """ + case 17 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOpBroadcast1(TestMatMulV2Op): + """ + case 14_3 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = True + self.trans_y = True + + +class TestMatMuklOpBroadcast2(TestMatMulV2Op): + """ + case 14_4 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = False + self.trans_y = True + + +#--------------------test matmul fp16-------------------- + + +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulV2Op) +create_test_fp16_class(TestMatMuklOp2) +create_test_fp16_class(TestMatMuklOp3) +create_test_fp16_class(TestMatMuklOp4) +create_test_fp16_class(TestMatMuklOp5) +create_test_fp16_class(TestMatMuklOp6) +create_test_fp16_class(TestMatMuklOp7) +create_test_fp16_class(TestMatMuklOp8) +create_test_fp16_class(TestMatMuklOp9) +create_test_fp16_class(TestMatMuklOp10) +create_test_fp16_class(TestMatMuklOp11) +create_test_fp16_class(TestMatMuklOp12) +create_test_fp16_class(TestMatMuklOp13) +create_test_fp16_class(TestMatMuklOp14) +create_test_fp16_class(TestMatMuklOp15) +create_test_fp16_class(TestMatMuklOp16) +create_test_fp16_class(TestMatMuklOp17) + + +class TestMatMulV2API(unittest.TestCase): + def setUp(self): + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_npu(): + self.places.append(paddle.NPUPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = 
fluid.data(name="input_x", shape=[4, 3], dtype="float32") + input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32") + + result = paddle.matmul(input_x, input_y) + + x_np = np.random.random([4, 3]).astype("float32") + y_np = np.random.random([3, 4]).astype("float32") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np, + "input_y": y_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float32") + input_y = np.random.random([3, 4]).astype("float32") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + + def test_dygraph_fp16(self): + if paddle.is_compiled_with_npu(): place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4)) + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float16") + input_y = np.random.random([3, 4]).astype("float16") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) if __name__ == '__main__': diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 0f463b0c7d941..20af4158df48f 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -43,6 +43,7 @@ from ..fluid.framework import cpu_places # noqa: F401 from ..fluid.framework import cuda_places # noqa: F401 from ..fluid.framework import xpu_places # noqa: F401 +from ..fluid.framework import npu_places # noqa: F401 from ..fluid.framework import Variable # noqa: F401 from ..fluid.layers.control_flow import Print # noqa: F401 from ..fluid.layers.nn import py_func # noqa: F401 @@ -99,6 +100,7 @@ 'cpu_places', 'cuda_places', 'xpu_places', + 'npu_places', 'Variable', 'create_global_var', 'accuracy', diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index 69baa4facfa96..efdc6847f0056 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -74,7 +74,22 @@ def _is_cuda_available(): return False -def _run_dygraph_single(use_cuda): +def _is_npu_available(): + """ + Check whether NPU is avaiable. + """ + try: + assert len(paddle.static.npu_places()) > 0 + return True + except Exception as e: + logging.warning( + "You are using NPU version PaddlePaddle, but there is no NPU " + "detected on your machine. Maybe NPU devices is not set properly." + "\n Original Error is {}".format(e)) + return False + + +def _run_dygraph_single(use_cuda, use_npu): """ Testing the simple network in dygraph mode using one CPU/GPU. 
@@ -84,6 +99,8 @@ def _run_dygraph_single(use_cuda): paddle.disable_static() if use_cuda: paddle.set_device('gpu') + elif use_npu: + paddle.set_device('npu') else: paddle.set_device('cpu') weight_attr = paddle.ParamAttr( @@ -102,7 +119,7 @@ def _run_dygraph_single(use_cuda): opt.step() -def _run_static_single(use_cuda): +def _run_static_single(use_cuda, use_npu): """ Testing the simple network with executor running directly, using one CPU/GPU. @@ -119,8 +136,14 @@ def _run_static_single(use_cuda): param_grads = paddle.static.append_backward( out, parameter_list=[weight.name])[0] - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(train_prog, feed={input.name: _prepare_data(1)}, @@ -128,7 +151,7 @@ def _run_static_single(use_cuda): paddle.disable_static() -def _run_static_parallel(use_cuda, device_list): +def _run_static_parallel(use_cuda, use_npu, device_list): """ Testing the simple network in data parallel mode, using multiple CPU/GPU. @@ -150,8 +173,15 @@ def _run_static_parallel(use_cuda, device_list): train_prog).with_data_parallel( loss_name=loss.name, places=device_list) - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + compiled_prog = train_prog + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(compiled_prog, feed={input.name: _prepare_data(len(device_list))}, @@ -182,23 +212,31 @@ def run_check(): if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() + use_npu = False + elif paddle.is_compiled_with_npu(): + use_npu = _is_npu_available() + use_cuda = False else: + use_npu = False use_cuda = False if use_cuda: device_str = "GPU" device_list = paddle.static.cuda_places() + elif use_npu: + device_str = "NPU" + device_list = paddle.static.npu_places() else: device_str = "CPU" device_list = paddle.static.cpu_places(device_count=2) device_count = len(device_list) - _run_static_single(use_cuda) - _run_dygraph_single(use_cuda) + _run_static_single(use_cuda, use_npu) + _run_dygraph_single(use_cuda, use_npu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: - _run_static_parallel(use_cuda, device_list) + _run_static_parallel(use_cuda, use_npu, device_list) print("PaddlePaddle works well on {} {}s.".format(device_count, device_str)) print( From 71cb3ff805c1abc4762e6f302c7f8c46942e6f7c Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Mon, 11 Oct 2021 14:41:01 +0800 Subject: [PATCH 49/80] enhance yolobox trt plugin (#34128) * enhance yolobox plugin --- .../inference/tensorrt/convert/yolo_box_op.cc | 9 ++- .../tensorrt/plugin/yolo_box_op_plugin.cu | 65 ++++++++++++++----- .../tensorrt/plugin/yolo_box_op_plugin.h | 3 + .../ir/inference/test_trt_yolo_box_op.py | 51 +++++++++++++++ 4 files changed, 111 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc index 2d12eaf736b75..17d217dff43fd 100644 --- a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -48,13 +48,20 @@ class YoloBoxOpConverter : public OpConverter { float conf_thresh = 
BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + bool iou_aware = op_desc.HasAttr("iou_aware") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("iou_aware")) + : false; + float iou_aware_factor = + op_desc.HasAttr("iou_aware_factor") + ? BOOST_GET_CONST(float, op_desc.GetAttr("iou_aware_factor")) + : 0.5; int type_id = static_cast(engine_->WithFp16()); auto input_dim = X_tensor->getDimensions(); auto* yolo_box_plugin = new plugin::YoloBoxPlugin( type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, - input_dim.d[1], input_dim.d[2]); + iou_aware, iou_aware_factor, input_dim.d[1], input_dim.d[2]); std::vector yolo_box_inputs; yolo_box_inputs.push_back(X_tensor); diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 10123cd4fa0e1..57177cfa8b421 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include #include #include @@ -29,7 +27,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, - const float scale_x_y, const int input_h, + const float scale_x_y, const bool iou_aware, + const float iou_aware_factor, const int input_h, const int input_w) : data_type_(data_type), class_num_(class_num), @@ -37,6 +36,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, downsample_ratio_(downsample_ratio), clip_bbox_(clip_bbox), scale_x_y_(scale_x_y), + iou_aware_(iou_aware), + iou_aware_factor_(iou_aware_factor), input_h_(input_h), input_w_(input_w) { anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); @@ -45,6 +46,7 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, assert(class_num_ > 0); assert(input_h_ > 0); assert(input_w_ > 0); + assert((iou_aware_factor_ > 0 && iou_aware_factor_ < 1)); cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), @@ -59,6 +61,8 @@ YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { DeserializeValue(&data, &length, &downsample_ratio_); DeserializeValue(&data, &length, &clip_bbox_); DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &iou_aware_); + DeserializeValue(&data, &length, &iou_aware_factor_); DeserializeValue(&data, &length, &input_h_); DeserializeValue(&data, &length, &input_w_); } @@ -133,8 +137,19 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, __device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + int entry, bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +__device__ inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride) { 
+ return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; } template @@ -178,7 +193,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, const int w, const int an_num, const int class_num, const int box_num, int input_size_h, int input_size_w, bool clip_bbox, const float scale, - const float bias) { + const float bias, bool iou_aware, + const float iou_aware_factor) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; float box[4]; @@ -193,11 +209,16 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, + iou_aware); float conf = sigmoid(static_cast(input[obj_idx])); - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + if (iou_aware) { + int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + float iou = sigmoid(input[iou_idx]); + conf = powf(conf, 1. - iou_aware_factor) * powf(iou, iou_aware_factor); + } + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, + iou_aware); if (conf < conf_thresh) { for (int i = 0; i < 4; ++i) { @@ -212,8 +233,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, + 5, iou_aware); int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); @@ -240,7 +261,8 @@ int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, reinterpret_cast(inputs[1]), reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, - input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias, iou_aware_, + iou_aware_factor_); return cudaGetLastError() != cudaSuccess; } @@ -274,6 +296,8 @@ size_t YoloBoxPlugin::getSerializationSize() const TRT_NOEXCEPT { serialize_size += SerializedSize(scale_x_y_); serialize_size += SerializedSize(input_h_); serialize_size += SerializedSize(input_w_); + serialize_size += SerializedSize(iou_aware_); + serialize_size += SerializedSize(iou_aware_factor_); return serialize_size; } @@ -285,6 +309,8 @@ void YoloBoxPlugin::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, downsample_ratio_); SerializeValue(&buffer, clip_bbox_); SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, iou_aware_); + SerializeValue(&buffer, iou_aware_factor_); SerializeValue(&buffer, input_h_); SerializeValue(&buffer, input_w_); } @@ -326,8 +352,8 @@ void YoloBoxPlugin::configurePlugin( nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const TRT_NOEXCEPT { return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, - downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, - input_w_); + downsample_ratio_, clip_bbox_, scale_x_y_, + iou_aware_, iou_aware_factor_, input_h_, input_w_); } YoloBoxPluginCreator::YoloBoxPluginCreator() {} @@ -367,6 +393,8 @@ nvinfer1::IPluginV2Ext* 
YoloBoxPluginCreator::createPlugin( float scale_x_y = 1.; int h = -1; int w = -1; + bool iou_aware = false; + float iou_aware_factor = 0.5; for (int i = 0; i < fc->nbFields; ++i) { const std::string field_name(fc->fields[i].name); @@ -386,6 +414,10 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( clip_bbox = *static_cast(fc->fields[i].data); } else if (field_name.compare("scale_x_y")) { scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware")) { + iou_aware = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware_factor")) { + iou_aware_factor = *static_cast(fc->fields[i].data); } else if (field_name.compare("h")) { h = *static_cast(fc->fields[i].data); } else if (field_name.compare("w")) { @@ -397,7 +429,8 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( return new YoloBoxPlugin( type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, - class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, iou_aware, + iou_aware_factor, h, w); } nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h index c9e9f9a0567ae..ae9a6739cedd3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -31,6 +31,7 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, const float scale_x_y, + const bool iou_aware, const float iou_aware_factor, const int input_h, const int input_w); YoloBoxPlugin(const void* data, size_t length); ~YoloBoxPlugin() override; @@ -89,6 +90,8 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { float scale_x_y_; int input_h_; int input_w_; + bool iou_aware_; + float iou_aware_factor_; std::string namespace_; }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py index 2166bbaa98b2f..b0124f055b4e1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py @@ -116,5 +116,56 @@ def test_check_output(self): PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) +class TRTYoloBoxIoUAwareTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + image_shape = [self.bs, self.channel, self.height, self.width] + image = fluid.data(name='image', shape=image_shape, dtype='float32') + image_size = fluid.data( + name='image_size', shape=[self.bs, 2], dtype='int32') + boxes, scores = self.append_yolobox(image, image_size) + + self.feeds = { + 'image': np.random.random(image_shape).astype('float32'), + 'image_size': np.random.randint( + 32, 64, size=(self.bs, 2)).astype('int32'), + } + self.enable_trt = True + self.trt_parameters = TRTYoloBoxTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [scores, boxes] + + def set_params(self): + self.bs = 4 + self.channel = 258 + self.height = 64 + self.width = 64 + self.class_num = 80 + self.anchors = [10, 13, 16, 30, 33, 23] + self.conf_thresh = .1 + self.downsample_ratio = 32 + 
self.iou_aware = True + self.iou_aware_factor = 0.5 + + def append_yolobox(self, image, image_size): + return fluid.layers.yolo_box( + x=image, + img_size=image_size, + class_num=self.class_num, + anchors=self.anchors, + conf_thresh=self.conf_thresh, + downsample_ratio=self.downsample_ratio, + iou_aware=self.iou_aware, + iou_aware_factor=self.iou_aware_factor) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + if __name__ == "__main__": unittest.main() From 414c252ae79fa2ca31b2159d3b2c56e491d55cd4 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 11 Oct 2021 16:48:56 +0800 Subject: [PATCH 50/80] Fix, test=document_fix (#36336) --- paddle/scripts/paddle_build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0c2580929081d..2cc4bd8d05fb8 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1076,7 +1076,6 @@ function get_quickly_disable_ut() { function card_test() { set -m - echo "$2 bengingggggg!!!!!" case_count $1 $2 ut_startTime_s=`date +%s` From 7a724ddb30c677b994b907e967b308a42ac8c7ad Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Mon, 11 Oct 2021 17:02:01 +0800 Subject: [PATCH 51/80] fix multi-node (#36329) --- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 10 +++++++++- paddle/fluid/platform/collective_helper.cc | 8 ++++---- python/paddle/fluid/dataset.py | 2 ++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b7e8bbb369492..fa2ff6cbdb8c7 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -117,6 +117,15 @@ class PSGPUWrapper { resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); keys_tensor.resize(resource_->total_gpu()); +#ifdef PADDLE_WITH_GLOO + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + if (gloo->Size() > 1) { + multi_node_ = 1; + } +#else + PADDLE_THROW( + platform::errors::Unavailable("heter ps need compile with GLOO")); +#endif if (multi_node_) { int dev_size = dev_ids.size(); // init inner comm @@ -127,7 +136,6 @@ class PSGPUWrapper { // init inter comm #ifdef PADDLE_WITH_GLOO inter_comms_.resize(dev_size); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index a765f344daf8a..03359d932b5ab 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -148,7 +148,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( paddle::platform::errors::InvalidArgument( "dev ids = [%d], it should greater than 0.", dev_ids.size())); const int kDevices = dev_ids.size(); - VLOG(3) << "Begin CreateNCCLCommMultiTrainer. device number: " << kDevices + VLOG(1) << "Begin CreateNCCLCommMultiTrainer. 
device number: " << kDevices << ", ntrainers: " << ntrainers << ", train_id: " << train_id << ", rind_id: " << ring_id; ncclComm_t comms[kDevices]; @@ -162,10 +162,10 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( #endif platform::dynload::ncclCommInitRank(comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); - VLOG(3) << "ncclCommInitRank: " << i; + VLOG(1) << "ncclCommInitRank: " << i; } PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); - VLOG(3) << "nccl group end seccessss"; + VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, platform::errors::InvalidArgument( @@ -174,7 +174,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( for (int i = 0; i < kDevices; ++i) { AssignNCCLComm(comms[i], kDevices * ntrainers, train_id * kDevices + i, dev_ids[i], ring_id); - VLOG(3) << "nccl communicator of train_id " << train_id * kDevices + i + VLOG(1) << "nccl communicator of train_id " << train_id * kDevices + i << " in ring " << ring_id << " has been created on device " << dev_ids[i]; } diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 438831208b66a..d683e36fbe5ab 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -396,6 +396,8 @@ def set_feed_type(self, data_feed_type): Set data_feed_desc """ self.proto_desc.name = data_feed_type + if (self.proto_desc.name == "SlotRecordInMemoryDataFeed"): + self.dataset = core.Dataset("SlotRecordDataset") @deprecated( since="2.0.0", From c38b04883e8b3079d8321b5cce03f9ec07df1fd1 Mon Sep 17 00:00:00 2001 From: caozhou <48191911+Caozhou1995@users.noreply.github.com> Date: Mon, 11 Oct 2021 17:45:18 +0800 Subject: [PATCH 52/80] add reshard module (#35779) * add reshard module * fix conflict * update reshard module * update and add unitest * update reshard module and unitest * add more unitests --- .../distributed/auto_parallel/__init__.py | 2 + .../distributed/auto_parallel/completion.py | 170 +++ .../distributed/auto_parallel/context.py | 3 + .../auto_parallel/operators/dist_embedding.py | 14 +- .../distributed/auto_parallel/parallelizer.py | 9 +- .../distributed/auto_parallel/reshard.py | 1002 +++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 12 + .../unittests/test_auto_parallel_reshard.py | 287 +++++ .../test_auto_parallel_reshard_dpmppp.py | 173 +++ .../test_auto_parallel_reshard_mppp.py | 231 ++++ .../test_auto_parallel_reshard_serial.py | 184 +++ 11 files changed, 2083 insertions(+), 4 deletions(-) create mode 100644 python/paddle/distributed/auto_parallel/reshard.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 5b0fdc1f1f166..31f92e2575a1f 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -19,5 +19,7 @@ from .interface import set_pipeline_stage # noqa: F401 from .interface import ProcessMesh # noqa: F401 from .completion import complete_annotation # noqa: F401 +from .completion import complete_backward_annotation # noqa: F401 +from .reshard import reshard # noqa: F401 __all__ = [] diff --git 
a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 6e886d09d67bd..3fdbad6950db5 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -23,6 +23,7 @@ from .utils import print_program_with_distributed_attr from .context import get_default_distributed_context from .operators import find_best_compatible_distributed_operator_impl +from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"] @@ -597,3 +598,172 @@ def sort_key_fun(node): dist_context.amend_distributed_attr_for_program() return program + + +def complete_backward_annotation(auto_parallel_main_prog, dist_context): + """Complete the annotation of vars and ops in the backward phase for parallel program.""" + + def _is_grad_var_name(name): + if "@GRAD" in name: + return True + return False + + grad_start_idx = None + for idx, op in enumerate(auto_parallel_main_prog.global_block().ops): + for var_name in op.output_arg_names: + # TODO: use _is_loss_op to judge + if "@GRAD" in var_name and op.type == "fill_constant": + grad_start_idx = idx + break + assert grad_start_idx is not None, "No backward procedure found in this program." + + ops = list(auto_parallel_main_prog.global_block().ops) + vars = auto_parallel_main_prog.global_block().vars + for idx in range(grad_start_idx, len(ops)): + # complete the loss op + if idx == grad_start_idx: + grad_var = vars[ops[idx].output_arg_names[0]] + grad_var_name = grad_var.name + forward_var_name = grad_var_name[:grad_var_name.find("@GRAD")] + forward_var = vars[forward_var_name] + tensor_attr = TensorDistributedAttribute(grad_var, dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program(grad_var, + tensor_attr) + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + + # in the data parallel mode, the loss op followed by scale op. + if ops[idx + 1].type == "scale" and grad_var_name in ops[idx + 1].input_arg_names \ + and grad_var_name in ops[idx + 1].output_arg_names: + op_attr = OperatorDistributedAttribute(ops[idx + 1], + dist_context) + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx + 1], + op_attr) + continue + + # complete the annotation of the optimizer op. + # TODO: use _is_optimizer_op to judge + if "Grad" in ops[idx].input_names and "Param" in ops[idx].input_names: + assert len(ops[idx].input( + "Param")) == 1, "Only support one-to-one now." + assert len(ops[idx].input( + "Grad")) == 1, "Only support one-to-one now." 
+ var = vars[ops[idx].input("Param")[0]] + grad_var = vars[ops[idx].input("Grad")[0]] + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + var).get_dims_mapping() + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + continue + + # complete the c_allreduce_sum op for gradient in the data parallel mode. + if ops[idx].type == "c_allreduce_sum" and ops[ + idx].input_arg_names == ops[idx].output_arg_names: + grad_var = vars[ops[idx].output_arg_names[0]] + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + grad_var).get_process_mesh() + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + continue + + # complete the annotation of grad op + grad_op = ops[idx] + for i, op in enumerate(ops[:grad_start_idx]): + match_op = None + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc, + set(), + []) + grad_op_input = [] + for input_arg_name in grad_op.desc.input_arg_names(): + if "@GRAD" in input_arg_name: + name = input_arg_name[:input_arg_name.find("@GRAD") + 5] + grad_op_input.append(name) + else: + grad_op_input.append(input_arg_name) + + # like sum op: the count of grad op will larger than 1 + if len(grad_op_desc_list) > 1: + for grad_op_desc in grad_op_desc_list: + if grad_op_input == grad_op_desc.input_arg_names() \ + and grad_op.desc.type() == grad_op_desc.type(): + match_op = op + break + elif len(grad_op_desc_list) == 1: + if grad_op_input == grad_op_desc_list[0].input_arg_names() \ + and grad_op.desc.type() == grad_op_desc_list[0].type(): + match_op = op + + if match_op is not None: + op_attr = dist_context.get_op_distributed_attr_for_program(op) + grad_op_attr = OperatorDistributedAttribute(grad_op, + dist_context) + grad_op_attr.set_process_mesh(op_attr.get_process_mesh()) + for var_name in grad_op.input_arg_names: + if "@GRAD" in var_name: + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + vars[var_name]).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, + dims_mapping) + else: + dims_mapping = op_attr.get_input_dims_mapping(var_name) + grad_op_attr.set_input_dims_mapping(var_name, + dims_mapping) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) + + for var_name in grad_op.output_arg_names: + if "@GRAD" in var_name: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = grad_op_attr.get_process_mesh() + dims_mapping = grad_op_attr.get_input_dims_mapping( + forward_var.name) + tensor_attr.set_process_mesh(process_mesh) + tensor_attr.set_dims_mapping(dims_mapping) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + break + + # complete the annotation of sum op for multiple renamed grad var + if grad_op.type == "sum" and all( + map(_is_grad_var_name, grad_op.input_arg_names)): + assert len(grad_op.output_arg_names + ) == 1, "The output count of sum op should be one." 
+ grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context) + for var_name in grad_op.input_arg_names: + if "@GRAD" in var_name: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, dims_mapping) + for var_name in grad_op.output_arg_names: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + grad_op_attr.set_process_mesh( + dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh()) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) diff --git a/python/paddle/distributed/auto_parallel/context.py b/python/paddle/distributed/auto_parallel/context.py index 4958c5adfae91..5e6565aa3d84c 100644 --- a/python/paddle/distributed/auto_parallel/context.py +++ b/python/paddle/distributed/auto_parallel/context.py @@ -59,6 +59,9 @@ def __init__(self): if self._process_mesh.ndim == 1: self._data_parallel_axis = 0 self._model_parallel_axis = 0 + elif self._process_mesh.ndim == 3: + self._data_parallel_axis = 1 + self._model_parallel_axis = 2 else: self._data_parallel_axis = 0 self._model_parallel_axis = 1 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 141c3d14a7fb2..3f8fbf9cc3a7a 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -146,8 +146,18 @@ def static_handle(dst_block, assert mesh_shape <= 2, "row_parallel_embedding only support 1 or 2 dimensional process mesh, but got {}".format( process_mesh_shape) num_partition = process_mesh_shape[embedding_row_dim_mapping] - # TODO generalize here, support any mesh group + # TODO generalize here, support any mesh group + model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( + )._get_model_parallel_info() if mesh_shape == 1: + if rank_id not in process_mesh_group: + assert len( + process_mesh.topology + ) == 2, " row_parallel_embedding process mapping only support 2 dimensional process mesh, \ + but got {}".format(len(process_mesh.topology)) + rank_id = process_mesh_group[ + process_mesh.process_group.index(rank_id) % + process_mesh_shape[0]] relative_idx = process_mesh_group.index(rank_id) else: relative_idx = rank_id % num_partition @@ -156,8 +166,6 @@ def static_handle(dst_block, relative_idx = relative_idx * per_part_size # TODO caculate ring id - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, model_parallel_axis, rank_id) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index a08da13a39caf..2994d35ef9202 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -17,9 +17,10 
@@ import paddle.fluid.core as core from .context import DistributedContext from .context import get_default_distributed_context -from .completion import complete_annotation +from .completion import complete_annotation, complete_backward_annotation from .partitioner import Partitioner from .process import get_all_process_groups +from .reshard import reshard class AutoParallelizer: @@ -85,10 +86,16 @@ def parallelize(self, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() for process_group in all_process_groups: + if rank not in process_group._ranks: + continue process_group.instantiate() # The last step: remove all distributed attributes to be compatiable # with inference. self._remove_distributed_attrs(partitioned_main_prog) + complete_backward_annotation(partitioned_main_prog, self._dist_context) + reshard(partitioned_main_prog, partitioned_startup_prog, rank, + self._dist_context) + return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py new file mode 100644 index 0000000000000..d66d799c6e0f9 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -0,0 +1,1002 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import copy +from functools import reduce + +import paddle +import paddle.fluid.core as core +from paddle.utils import unique_name +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import Program, OpProtoHolder +import paddle.fluid.layers.utils as utils +from ..collective import _get_global_env +from .context import DistributedContext +from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from .process import new_process_group, ProcessGroup, PROCESS_GROUP_MAP + + +class AllGatherOpDesc: + """ + Describe the allgather op in the reshard phase. + + Args: + group (list): Process group. + """ + + def __init__(self, group): + self._group = group + self._desc = "all_gather" + + @property + def group(self): + return self._group + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, group: {self._group}." + + +class SendOpDesc: + """ + Describe the send op in the reshard phase. + + Args: + partition_index (list): The index of partition in complete tensor. + dst (int): The destination process to receive. + """ + + def __init__(self, partition_index, dst): + self._dst = dst + self._partition_index = partition_index + self._desc = "send" + + @property + def partition_index(self): + return self._partition_index + + @property + def dst(self): + return self._dst + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index: {self._partition_index}, dst: {self._dst}." + + +class RecvOpDesc: + """ + Describe the recv op in the reshard op. 
+ + Args: + partition_index (list): The index of partition in complete tensor. + src (int): The source process to send. + """ + + def __init__(self, partition_index, src): + self._src = src + self._partition_index = partition_index + self._desc = "recv" + + @property + def partition_index(self): + return self._partition_index + + @property + def src(self): + return self._src + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index: {self._partition_index}, src: {self._src}." + + +class SliceOpDesc: + """ + Describe the slice op in the reshard phase. + + Args: + starts (list): It represents starting indices of corresponding axis in ``axes``. + ends (list): It represents ending indices of corresponding axis in ``axes``. + axes (list): Axes that `starts` and `ends` apply to . + """ + + def __init__(self, starts, ends, axes): + self._starts = starts + self._ends = ends + self._axes = axes + self._desc = "slice" + + @property + def starts(self): + return self._starts + + @property + def ends(self): + return self._ends + + @property + def axes(self): + return self._axes + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, starts: {self._starts}, ends: {self._ends}, axes: {self._axes}." + + +class ConcatOpDesc: + """ + Describe the concat op in the reshard phase. + + Args: + partition_index_list (list): A list contains all partition index. + """ + + def __init__(self, partition_index_list): + self._partition_index_list = partition_index_list + self._desc = "concat" + + @property + def partition_index_list(self): + return self._partition_index_list + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index_list: {self._partition_index_list}." 
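+
+# Illustration (hand-worked, approximate): for a [4, 4] tensor row-sharded over
+# processes [0, 1] whose consumer wants it replicated, the plan built by
+# find_op_desc_seq below could look like, on each of the two ranks:
+#   [AllGatherOpDesc(group=[0, 1]),
+#    ConcatOpDesc(partition_index_list=[[[0, 2], [0, 4]], [[2, 4], [0, 4]]]),
+#    SliceOpDesc(starts=[0, 0], ends=[4, 4], axes=[0, 1])]
+# i.e. gather the shards, concat them back into the complete tensor, then slice
+# out the piece this rank actually needs.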
+ + +def _compute_partition_shape(complete_shape, dims_mapping, process_shape): + """Compute the shape of partition.""" + partition_shape = [] + for idx, item in enumerate(complete_shape): + if dims_mapping[idx] == -1: + partition_shape.append(item) + else: + partition_shape.append(item // process_shape[dims_mapping[idx]]) + + return partition_shape + + +def _compute_process_index(process, process_group, process_shape): + """Compute the index of process_shape corresponding to the process.""" + relative_process = process_group.index(process) + process_index = [] + product = reduce(lambda x, y: x * y, process_shape) + + for i in range(len(process_shape)): + idx = relative_process // (product // process_shape[i]) + product = product // process_shape[i] + relative_process = relative_process - relative_process // product * product + process_index.append(idx) + + return process_index + + +def _compute_partition_index(process, complete_shape, dims_mapping, + process_shape, process_group): + """Compute the partition index in complete tensor.""" + partition_shape = _compute_partition_shape(complete_shape, dims_mapping, + process_shape) + process_index = _compute_process_index(process, process_group, + process_shape) + partition_index = [] + + for i in range(len(complete_shape)): + if dims_mapping[i] == -1: + partition_index.append([0, partition_shape[i]]) + else: + partition_index.append([ + process_index[dims_mapping[i]] * partition_shape[i], + (process_index[dims_mapping[i]] + 1) * partition_shape[i] + ]) + + return partition_index + + +def _compute_concat_info(partition_index_x, partition_index_y): + """Judge whether two partition can be concatenated and compute concatenated partition index.""" + differ_count = 0 + concat_axis = -1 + first_order = 0 + new_partition = [] + + for idx, item in enumerate(partition_index_x): + if item != partition_index_y[idx]: + differ_count += 1 + if item[1] == partition_index_y[idx][0] and item[ + 0] < partition_index_y[idx][1]: + concat_axis = idx + new_partition.append([item[0], partition_index_y[idx][1]]) + elif item[0] == partition_index_y[idx][1] and item[ + 1] > partition_index_y[idx][0]: + first_order = 1 + concat_axis = idx + new_partition.append([partition_index_y[idx][0], item[1]]) + else: + new_partition.append(item) + + if differ_count == 1: + return concat_axis, first_order, new_partition + else: + return -1, first_order, new_partition + + +def _concat_partitions(partition_index_list, partition_index): + """Concat the given partitions without inserting concat op.""" + if not partition_index_list: + partition_index_list.append(partition_index) + else: + i = 0 + has_concat = False + while i < len(partition_index_list): + concat_axis, _, new_partition = _compute_concat_info( + partition_index_list[i], partition_index) + if concat_axis != -1: + has_concat = True + partition_index_list.pop(i) + _concat_partitions(partition_index_list, new_partition) + break + i += 1 + if not has_concat: + partition_index_list.append(partition_index) + + +def _is_overlapped(shape_x, shape_y): + """Judge whether two partitions intersect on the specified dimension.""" + overlapped = False + if (shape_y[0] <= shape_x[0] < shape_y[1]) or ( + shape_x[0] <= shape_y[0] < shape_x[1]): + overlapped = True + return overlapped + + +def _need_reshard(tensor_dist_attr, op_dist_attr): + """Judge the tensor whether needs to be resharded.""" + is_reshard = False + tensor_dims_mapping = tensor_dist_attr.get_dims_mapping() + tensor_process_mesh = tensor_dist_attr.get_process_mesh() + 
op_input_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_dist_attr.get_owner_tensor().name) + op_process_mesh = op_dist_attr.get_process_mesh() + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, op_input_dims_mapping, + op_process_mesh + ])): + if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh._id != op_process_mesh._id: + is_reshard = True + return is_reshard + + +def _compute_complete_shape(slice_shape, process_shape, dims_mapping): + """compute the complete shape of the slice tensor with its process mesh and dims mapping""" + complete_shape = [] + for idx, item in enumerate(slice_shape): + if dims_mapping[idx] == -1: + complete_shape.append(item) + else: + complete_shape.append(item * process_shape[dims_mapping[idx]]) + return complete_shape + + +def find_op_desc_seq(source_tensor, tensor_dist_attr, op_dist_attr): + """ + Find the op description sequence to reshard the source tensor for matching the op requirement. + + Args: + source_tensor (Variable): A tensor with distributed attribute. + tensor_dist_attr (TensorDistributedAttribute): The distributed attribute of tensor. + op_dist_attr (OperatorDistributedAttribute): The distributed attribute of operator. + + Returns: + Dict, the dict represents the required op description sequence corresponding to process, The key of dict is + process and value is a list containing op description. + """ + source_dims_mapping = tensor_dist_attr.get_dims_mapping() + source_process_mesh = tensor_dist_attr.get_process_mesh() + source_process_group = source_process_mesh.process_group + source_process_shape = source_process_mesh.topology + + target_process_mesh = op_dist_attr.get_process_mesh() + target_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_dist_attr.get_owner_tensor().name) + target_process_group = target_process_mesh.process_group + target_process_shape = target_process_mesh.topology + + complete_shape = _compute_complete_shape( + source_tensor.shape, source_process_shape, source_dims_mapping) + op_desc_seq = {} + + # TODO: if the target process group has the same process with source process group + if set(target_process_group).intersection(set( + source_process_group)) and set(target_process_group).difference( + set(source_process_group)): + pass + + # in the different process group, it will use send, recv, concat and slice op + elif target_process_group != source_process_group: + partition_process_mapping_list = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index(source_process, complete_shape, source_dims_mapping, \ + source_process_shape, source_process_group) + if not partition_process_mapping_list: + partition_process_mapping_list.append( + [source_partition_index, [source_process], [False]]) + else: + partition_list = list( + [item[0] for item in partition_process_mapping_list]) + process_list = list( + [item[1] for item in partition_process_mapping_list]) + has_used = list( + [item[2] for item in partition_process_mapping_list]) + if partition_list.count(source_partition_index) == 1: + index = partition_list.index(source_partition_index) + process_list[index].append(source_process) + has_used[index].append(False) + else: + partition_process_mapping_list.append( + [source_partition_index, [source_process], [False]]) + + for target_process in target_process_group: + has_sent = [] + target_partition_index = _compute_partition_index( + target_process, complete_shape, target_dims_mapping, + target_process_shape, 
target_process_group) + partition_index_list = [] + all_partition_index_list = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index( + source_process, complete_shape, source_dims_mapping, + source_process_shape, source_process_group) + to_send_process = None + if all(_ for _ in list(map(_is_overlapped, source_partition_index, target_partition_index))) \ + and source_partition_index not in has_sent: + idx = list([ + item[0] for item in partition_process_mapping_list + ]).index(source_partition_index) + has_used = list( + [item[2] + for item in partition_process_mapping_list])[idx] + process_list = list( + [item[1] + for item in partition_process_mapping_list])[idx] + i = 0 + while i < len(has_used): + if not has_used[i]: + to_send_process = process_list[i] + has_used[i] = True + break + i += 1 + if i == len(has_used): + has_used = list(map(lambda x: False, has_used)) + to_send_process = process_list[0] + has_used[0] = True + assert to_send_process is not None, "Failed to find the send process." + + if to_send_process not in op_desc_seq.keys(): + op_desc_seq[to_send_process] = [] + if target_process not in op_desc_seq.keys(): + op_desc_seq[target_process] = [] + all_partition_index_list.append(source_partition_index) + + # append send and recv op desc + send_op_desc = SendOpDesc(source_partition_index, + target_process) + recv_op_desc = RecvOpDesc(source_partition_index, + to_send_process) + op_desc_seq[to_send_process].append(send_op_desc) + op_desc_seq[target_process].append(recv_op_desc) + has_sent.append(source_partition_index) + _concat_partitions(partition_index_list, + source_partition_index) + + # append concat op desc + op_desc_seq[target_process].append( + ConcatOpDesc(all_partition_index_list)) + + # append slice op desc + slice_starts = [] + slice_ends = [] + slices_axes = [] + concatenated_partition_index = partition_index_list[0] + for idx, item in enumerate(concatenated_partition_index): + slice_starts.append(target_partition_index[idx][0] - item[0]) + slice_ends.append(target_partition_index[idx][1] - item[0]) + slices_axes.append(idx) + op_desc_seq[target_process].append( + SliceOpDesc(slice_starts, slice_ends, slices_axes)) + + # in the same process group, it will use allgahther and slice op + else: + partition_index_list = [] + all_partition_index_list = [] + process_index = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index( + source_process, complete_shape, source_dims_mapping, + source_process_shape, source_process_group) + if source_partition_index not in partition_index_list: + partition_index_list.append(source_partition_index) + process_index.append( + [[source_process, ], source_partition_index]) + else: + process_index[partition_index_list.index( + source_partition_index)][0].append(source_process) + + for i in range(len(process_index[0][0])): + group = [] + for j in range(len(process_index)): + group.append(process_index[j][0][i]) + if i == 0: + all_partition_index_list.append(process_index[j][1]) + for process in group: + # append slice op desc + slice_starts = [] + slice_ends = [] + slices_axes = [] + target_partition_index = _compute_partition_index( + process, complete_shape, target_dims_mapping, + target_process_shape, target_process_group) + for idx, item in enumerate(target_partition_index): + slice_starts.append(item[0]) + slice_ends.append(item[1]) + slices_axes.append(idx) + + slice_op_desc = SliceOpDesc( + starts=slice_starts, ends=slice_ends, 
axes=slices_axes) + op_desc_seq[process] = [AllGatherOpDesc(group=group), + ConcatOpDesc(partition_index_list=all_partition_index_list), slice_op_desc] \ + if len(group) > 1 else [slice_op_desc] + + return op_desc_seq + + +def _insert_send_op(block, idx, tensor, dst): + """Insert send op into block at the given index.""" + op_type = 'send_v2' + block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + attrs={ + 'ring_id': 0, + 'peer': dst, + 'use_calc_stream': True, + }) + + +def _insert_recv_op(block, idx, tensor, src): + """Insert recv op into block at the given index.""" + op_type = 'recv_v2' + block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': 0, + 'peer': src, + 'out_shape': tensor.shape, + 'dtype': tensor.dtype, + 'use_calc_stream': True, + }) + + +def _insert_concat_op(block, idx, tensors, axis): + """Insert concat op into block at the given block.""" + inputs = {'X': tensors} + attrs = {} + attrs['axis'] = axis + helper = LayerHelper('concat', **locals()) + with paddle.static.program_guard(block.program): + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + block._insert_op( + idx, type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out + + +def _insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name): + """Insert slice op into block at the given block.""" + inputs = {'Input': tensor} + infer_flags = list(1 for i in range(len(axes))) + attrs = { + "axes": axes, + "starts": starts, + "ends": ends, + "infer_flags": infer_flags + } + helper = LayerHelper('slice', **locals()) + out = block.create_var( + name=new_var_name, + dtype=tensor.dtype, + type=core.VarDesc.VarType.LOD_TENSOR) + block._insert_op( + idx, type="slice", inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out + + +def _insert_split_op(block, idx, tensor, num_or_sections): + """Insert split op into block at the given index.""" + helper = LayerHelper('split', **locals()) + input_shape = tensor.shape + inputs = {'X': tensor} + attrs = {'num': num_or_sections, "axis": 0} + with paddle.static.program_guard(block.program): + outs = [ + helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) for i in range(num_or_sections) + ] + block._insert_op( + idx, type="split", inputs=inputs, outputs={'Out': outs}, attrs=attrs) + return outs + + +def _insert_allgather_op(block, idx, tensor, ranks): + """Insert allgather op into block at the given index.""" + + def _insert_fill_constant_op(block, idx): + """Insert fill constant op into block at the given index.""" + helper = LayerHelper("fill_constant", **locals()) + with paddle.static.program_guard(block.program): + out = helper.create_variable_for_type_inference(dtype="int32") + inputs = {} + attrs = {'force_cpu': False} + attrs['str_value'] = str(int("1")) + attrs['value'] = int("1") + attrs['dtype'] = out.dtype + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant') + block._insert_op( + idx, + type='fill_constant', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs) + out.stop_gradient = True + return out + + tensor_list = [] + group = new_process_group(ranks) + idx_offset = 0 + + # instant process group before insert allgather op. 
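+    # The block below only runs when the freshly created group is not
+    # instantiated yet: it inserts a dummy fill_constant, a c_allreduce_sum on
+    # the global ring and a c_sync_calc_stream as a warm-up/synchronization
+    # step before the real c_allgather, and idx_offset counts the extra ops so
+    # later insert positions stay correct.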
+ if not group.is_instantiate(): + # insert fill_constant op + fill_constant_out = _insert_fill_constant_op(block, idx) + fill_constant_out.stop_gradient = True + + # insert c_allreduce_sum op + block._insert_op( + idx + 1, + type="c_allreduce_sum", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}, + attrs={'ring_id': 0, + 'use_calc_stream': True}) + + # insert c_sync_calc_stream op + block._insert_op( + idx + 2, + type="c_sync_calc_stream", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}) + idx_offset = 3 + + # insert c_allgather op + op_type = 'c_allgather' + helper = LayerHelper(op_type, **locals()) + with paddle.static.program_guard(block.program): + allgather_out = helper.create_variable_for_type_inference( + dtype=tensor.dtype) + block._insert_op( + idx + idx_offset, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [allgather_out]}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'nranks': group._nranks + }) + idx_offset += 1 + + # insert split op + split_out = _insert_split_op(block, idx + idx_offset, allgather_out, + group._nranks) + idx_offset += 1 + tensor_list.extend(split_out) + return tensor_list, idx_offset + + +def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index, + block, idx): + """Concat the tensors and insert concat op.""" + if not partition_tensor_list: + partition_tensor_list.append((tensor, partition_index)) + else: + i = 0 + has_concat = False + while i < len(partition_tensor_list): + concat_axis, first_order, new_partition = _compute_concat_info( + partition_tensor_list[i][1], partition_index) + if concat_axis != -1: + has_concat = True + _ = _insert_concat_op(block, idx[0], [partition_tensor_list[i][0], tensor], concat_axis) \ + if first_order == 0 else \ + _insert_concat_op(block, idx[0], [tensor, partition_tensor_list[i][0]], concat_axis) + partition_tensor_list.pop(i) + idx[0] += 1 + _concat_partitions_with_op(partition_tensor_list, _, + new_partition, block, idx) + break + i += 1 + if not has_concat: + partition_tensor_list.append((tensor, partition_index)) + + +def _init_comm_for_send_recv(): + if not PROCESS_GROUP_MAP["global_group"].is_instantiate(): + PROCESS_GROUP_MAP["global_group"].instantiate() + + +HAS_SENT = {} +HAS_RECV = {} +HAS_ALLGATHER = {} + + +def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op, + dist_context): + """Parse op desc sequence and insert op in the block""" + global HAS_SENT + global HAS_RECV + global HAS_ALLGATHER + tensor_list = [] + partition_tensor_list = [] + if rank_id not in op_desc_seq.keys(): + return + op_desc_list = op_desc_seq[rank_id] + block = program.global_block() + assert var_name in block.vars.keys( + ), "The {} cannot be found in the {} program.".format(var_name, rank_id) + + idx = None + for index, op in list(enumerate(block.ops)): + if op.desc.id == reshard_op.desc.id: + idx = index + break + assert idx is not None, "The op for reshard cannot be found in the rank {} program.".format( + rank_id) + + matched_op = block.ops[idx] + source_tensor = block.vars[var_name] + for op_desc in op_desc_list: + if isinstance(op_desc, AllGatherOpDesc): # noqa: F401 + if var_name not in HAS_ALLGATHER.keys(): + HAS_ALLGATHER[var_name] = [] + if not HAS_ALLGATHER[var_name] or op_desc.group not in list( + map(lambda x: x[0], HAS_ALLGATHER[var_name])): + tensor_list, idx_offset = _insert_allgather_op( + block, idx, source_tensor, op_desc.group) + idx += idx_offset + tensor_name_list = [var.name for var in 
tensor_list] + HAS_ALLGATHER[var_name].append( + [op_desc.group, tensor_name_list]) + else: + for item in HAS_ALLGATHER[var_name]: + if op_desc.group == item[0]: + tensor_list = [ + program.global_block().vars[var_name] + for var_name in item[1] + ] + break + assert tensor_list, "The result of parsing allgather op should not be None." + + elif isinstance(op_desc, SendOpDesc): + _init_comm_for_send_recv() + if var_name not in HAS_SENT.keys(): + HAS_SENT[var_name] = [] + if op_desc.dst not in HAS_SENT[var_name]: + _insert_send_op(block, idx, source_tensor, op_desc.dst) + idx += 1 + HAS_SENT[var_name].append(op_desc.dst) + + elif isinstance(op_desc, RecvOpDesc): + _init_comm_for_send_recv() + if var_name not in HAS_RECV.keys(): + HAS_RECV[var_name] = {} + if op_desc.src not in HAS_RECV[var_name].keys(): + partition_index = op_desc.partition_index + shape = [] + for index in partition_index: + shape.append(index[1] - index[0]) + recv_tensor = block.create_var( + name=unique_name.generate(var_name + "@recv"), + shape=shape, + dtype=source_tensor.dtype) + _insert_recv_op(block, idx, recv_tensor, op_desc.src) + tensor_list.append(recv_tensor) + idx += 1 + HAS_RECV[var_name][op_desc.src] = recv_tensor + else: + tensor_list.append(HAS_RECV[var_name][op_desc.src]) + + elif isinstance(op_desc, ConcatOpDesc): + partition_index_list = op_desc.partition_index_list + idx_list = [idx] + for index, tensor in enumerate(tensor_list): + _concat_partitions_with_op(partition_tensor_list, tensor, + partition_index_list[index], block, + idx_list) + idx = idx_list[0] + + elif isinstance(op_desc, SliceOpDesc): + assert len(partition_tensor_list) == 1 or not partition_tensor_list + to_slice_tensor = partition_tensor_list[0][0] if len( + partition_tensor_list) == 1 else source_tensor + new_name = unique_name.generate(var_name + "@RESHARD") + target_tensor = _insert_slice_op( + block, + idx, + to_slice_tensor, + starts=op_desc.starts, + ends=op_desc.ends, + axes=op_desc.axes, + new_var_name=new_name) + + tensor_attr = TensorDistributedAttribute(target_tensor, + dist_context) + process_mesh = dist_context.get_op_distributed_attr_for_program( + matched_op).get_process_mesh() + dims_mapping = dist_context.get_op_distributed_attr_for_program( + matched_op).get_input_dims_mapping(var_name) + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program(target_tensor, + tensor_attr) + + # rename op input name according to new name + for op in block.ops: + for name in op.input_arg_names: + op_dist_attr = dist_context.get_op_distributed_attr_for_program( + op) + if name == var_name and op_dist_attr is not None: + op_process_mesh = op_dist_attr.get_process_mesh() + op_input_dims_mapping = op_dist_attr.get_input_dims_mapping( + var_name) + if op_process_mesh._id == process_mesh._id and op_input_dims_mapping == dims_mapping: + op.desc._rename_input(name, target_tensor.name) + op_dist_attr.set_input_dims_mapping( + target_tensor.name, dims_mapping) + op_dist_attr._dims_mapping.pop(name, None) + + +def _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): + """Remove no need ops in the main program""" + not_remove_op_ref = [ + "create_py_reader", "create_double_buffer_reader", "read" + ] + remove_op_idx = [] + block = auto_parallel_main_prog.global_block() + ops = block.ops + vars = block.vars + for idx, op in enumerate(ops): + # handle read op in the pipeline scene specially, it will be removed in the future. 
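+        # i.e. gather the shapes of the read op's outputs and rewrite the
+        # matching create_py_reader's "shape_concat" attribute to match them,
+        # then keep the read op itself (it is never added to remove_op_idx).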
+ if op.type == "read": + dim_list = [] + for var_name in op.output_arg_names: + dim_list.extend(vars[var_name].shape) + for i in range(idx, -1, -1): + if ops[i].type == "create_py_reader": + ops[i]._set_attr("shape_concat", dim_list) + break + continue + + # replace the input and output of c_sync_comm_stream op when in pipeline scene. + if op.type == "c_sync_comm_stream": + need_save = [] + for var_name in op.input_arg_names: + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + vars[var_name]).get_process_mesh() + if rank_id in process_mesh.process_group: + need_save.append(var_name) + if not need_save: + remove_op_idx.append(idx) + continue + + proto = OpProtoHolder.instance().get_op_proto(op.type) + op.desc.set_input(proto.inputs[0].name, need_save) + op.desc.set_output(proto.outputs[0].name, need_save) + continue + + # judge the other op whether should be removed. + op_dist_attr = dist_context.get_op_distributed_attr_for_program(op) + if op_dist_attr is not None: + op_process_mesh = op_dist_attr.get_process_mesh() + if rank_id not in op_process_mesh.process_group and op.type not in not_remove_op_ref: + remove_op_idx.append(idx) + + for idx in remove_op_idx[::-1]: + block._remove_op(idx) + + +def _remove_no_need_vars(auto_parallel_main_prog): + """Remove no need vars in the main program""" + remove_vars = set() + block = auto_parallel_main_prog.global_block() + ops = block.ops + vars = block.vars + need_vars = set() + for op in ops: + for var_name in op.input_arg_names: + if var_name in vars: + need_vars.add(var_name) + for var_name in op.output_arg_names: + if var_name in vars: + need_vars.add(var_name) + for var in vars: + if var not in need_vars: + remove_vars.add(var) + for var in remove_vars: + block._remove_var(var) + + +def remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id): + """Remove no need vars and ops in the main program.""" + _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id) + _remove_no_need_vars(auto_parallel_main_prog) + + +def remove_no_need_in_startup(auto_parallel_main_prog, + auto_parallel_startup_prog): + """Remove no need vars and ops in the startup program.""" + main_input_vars = set() + main_ops = auto_parallel_main_prog.global_block().ops + for op in main_ops: + for var_name in op.input_arg_names: + main_input_vars.add(var_name) + + startup_block = auto_parallel_startup_prog.global_block() + startup_output_vars = set() + startup_ops = startup_block.ops + for op in startup_ops: + # skip c_sync_comm_stream op + if op.type == "c_sync_comm_stream": + continue + for var_name in op.output_arg_names: + startup_output_vars.add(var_name) + + need_vars = set() + for var_name in startup_output_vars: + if var_name in main_input_vars: + need_vars.add(var_name) + + startup_ops = startup_block.ops + actual_need_vars = set() + for idx, op in enumerate(startup_ops): + is_need_op = False + if op.type == "c_sync_comm_stream": + continue + for var_name in op.output_arg_names: + if var_name in need_vars: + is_need_op = True + break + if is_need_op: + for var_name in op.output_arg_names: + actual_need_vars.add(var_name) + for var_name in op.input_arg_names: + actual_need_vars.add(var_name) + + remove_vars = set() + for var_name in startup_block.vars: + if var_name not in actual_need_vars: + remove_vars.add(var_name) + for var in remove_vars: + startup_block._remove_var(var) + + remove_op_idx = [] + vars = startup_block.vars + for idx, op in enumerate(startup_block.ops): + is_no_need_op = False + if op.type == 
"c_sync_comm_stream": + var_names = [] + for var_name in op.input_arg_names: + if var_name in vars: + var_names.append(var_name) + if not var_names: + remove_op_idx.append(idx) + else: + proto = OpProtoHolder.instance().get_op_proto(op.type) + op.desc.set_input(proto.inputs[0].name, var_names) + op.desc.set_output(proto.outputs[0].name, var_names) + continue + + for var_name in op.output_arg_names: + if var_name not in vars: + is_no_need_op = True + break + if is_no_need_op: + remove_op_idx.append(idx) + for idx in remove_op_idx[::-1]: + startup_block._remove_op(idx) + + +def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, + dist_context): + """ + Reshard tensor in the program according to its dist attr and corresponding op dist attr. + + Args: + auto_parallel_main_prog (Program): An auto parallel main program. + auto_parallel_startup_prog (Program): An auto parallel startup program. + rank_id (int): The process id. + """ + assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_main_prog should be Program, " \ + "but got {}.".format(type(auto_parallel_main_prog)) + assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_startup_prog should be Program, " \ + "but got {}.".format(type(auto_parallel_startup_prog)) + assert isinstance(rank_id, int), "The type of rank_id should be int, " \ + "but got {}.".format(type(rank_id)) + assert isinstance(dist_context, DistributedContext), "The type of dist_context should be DistributedContext, " \ + "but got {}.".format(type(dist_context)) + + block = auto_parallel_main_prog.global_block() + idx = 0 + while idx < len(block.ops): + pre_op_count = len(block.ops) + op = block.ops[idx] + op_dist_attr = dist_context.get_op_distributed_attr_for_program(op) + if op_dist_attr is not None: + idx_offset = 0 + for var_name in op.input_arg_names: + # skip lod_tensor_blocking_queue_0 + if var_name == "lod_tensor_blocking_queue_0": + continue + var = block.vars[var_name] + tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program( + var) + if tensor_dist_attr is not None and _need_reshard( + tensor_dist_attr, op_dist_attr): + reshard_op_desc = find_op_desc_seq(var, tensor_dist_attr, + op_dist_attr) + parse_op_desc(auto_parallel_main_prog, rank_id, + reshard_op_desc, var_name, op, dist_context) + cur_op_count = len(block.ops) + idx_offset = idx_offset + cur_op_count - pre_op_count + pre_op_count = cur_op_count + idx = idx + idx_offset + 1 + else: + idx += 1 + + # remove no need vars and ops in the main program + remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id) + + # remove no need vars and ops in the startip program + remove_no_need_in_startup(auto_parallel_main_prog, + auto_parallel_startup_prog) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 61a43aeb44e84..0c2731bc45258 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -86,6 +86,10 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) list(APPEND MIXED_DIST_TEST_OPS test_fleet_static_mp_layers) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_serial) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp) +list(APPEND MIXED_DIST_TEST_OPS 
test_auto_parallel_reshard_dpmppp) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -225,6 +229,10 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -589,6 +597,10 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard MODULES test_auto_parallel_reshard ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_dpmppp MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py new file mode 100644 index 0000000000000..89e9b7e817f45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -0,0 +1,287 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0, 1]) +PP_MESH_0 = None +PP_MESH_1 = None + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + if _global_parallel_strategy == "pp": + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1]) + else: + auto.shard_tensor( + self.linear0.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + if _global_parallel_strategy == "pp": + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + elif _global_parallel_strategy == "dp": + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1]) + else: + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, 
complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_backward_dist_attr(dist_context, dist_main_prog, op_need_check): + has_dist_attr = True + vars = dist_main_prog.global_block().vars + + op_dist_attr = dist_context.get_op_distributed_attr_for_program( + op_need_check) + if not op_dist_attr or not op_dist_attr.get_process_mesh(): + has_dist_attr = False + + for var_name in op_need_check.input_arg_names: + if not op_dist_attr.get_input_dims_mapping(var_name) or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh(): + has_dist_attr = False + break + + if has_dist_attr: + for var_name in op_need_check.output_arg_names: + if not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh(): + has_dist_attr = False + break + + return has_dist_attr + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id == 0: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization(dist_startup_prog, rank_id): + if rank_id == 0: + need_check_params = [ + "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0", + "linear_0.b_0" + ] + else: + need_check_params = ['linear_1.w_0', 'linear_1.b_0'] + + params = [] + for var_name, var in dist_startup_prog.global_block().vars.items(): + if var.is_parameter: + params.append(var_name) + + return params == need_check_params + + +def check_initialization_for_dp(dist_startup_prog): + need_check_params = [ + "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0", "linear_0.b_0" + ] + ['linear_1.w_0', 'linear_1.b_0'] + params = [] + for var_name, var in dist_startup_prog.global_block().vars.items(): + if var.is_parameter: + params.append(var_name) + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + + return params == need_check_params == broadcast_varnames + + +class TestMLPReshard(unittest.TestCase): + def test_complete_backward_annotation(self): + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, 0) + complete_backward_annotation(dist_main_prog, dist_context) + + op_need_check = None + for op in dist_main_prog.global_block().ops: + if op.type == "gelu_grad": + 
op_need_check = op + break + + # grad op should have dist attr + self.assertTrue( + check_backward_dist_attr(dist_context, dist_main_prog, + op_need_check)) + + def test_mlp_pp(self): + global _global_parallel_strategy + _global_parallel_strategy = "pp" + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + global PP_MESH_0 + PP_MESH_0 = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH) + global PP_MESH_1 + PP_MESH_1 = auto.ProcessMesh(mesh=[1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 1 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # parameter initialization of every rank should be different in the pipeline scene + self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + + def test_mlp_dp(self): + global _global_parallel_strategy + _global_parallel_strategy = "dp" + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + # send and recv should not exist in dp scene. + self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) + + # all parameters should be initialized in dp scene + self.assertTrue(check_initialization_for_dp(dist_startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py new file mode 100644 index 0000000000000..1e134eebfd23b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) +_global_process_mesh = auto.ProcessMesh( + [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) 
+ return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id in [0, 1, 4, 5]: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization_for_dpmppp(dist_startup_prog): + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + result = len(broadcast_varnames) > 0 + return result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_dpmppp(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + print(dist_main_prog) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + print(dist_main_prog) + print(dist_startup_prog) + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # check parameter initialization + self.assertTrue(check_initialization_for_dpmppp(dist_startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py new file mode 100644 index 0000000000000..5a10a21834570 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = "mp_pp" +ROOT_MESH = auto.ProcessMesh([[0, 1], [2, 3]]) +_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([0, 1], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([2, 3], parent=ROOT_MESH) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.word_embeddings = nn.Embedding( + hidden_size, + hidden_size, + weight_attr=paddle.ParamAttr( + name="word_embeddings", + initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range))) + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + + def forward(self, input): + auto.shard_tensor( + self.word_embeddings.weight, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 0]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[0, -1]) + auto.shard_tensor(self.linear2.weight, PP_MESH_1, dim_mapping=[0, -1]) + w_out = self.word_embeddings(input) + out = self.linear0(w_out) + gelu_out = F.gelu(out, approximate=True) + out = self.linear1(gelu_out) + out1 = self.linear2(gelu_out) + out = out + out1 + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data(name="input", shape=[batch_size], dtype='int32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = 
partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id in [0, 1]: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names[ + 0]: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization_for_mppp(dist_startup_prog, rank_id): + if rank_id in [0, 1]: + need_check_params = [] + else: + need_check_params = ["linear_1.b_0", "linear_2.b_0"] + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + + return need_check_params == broadcast_varnames + + +def check_allgather(dist_main_program): + allgather_out = "x@RESHARD_0" + var_result = False + op_result = False + vars = dist_main_program.global_block().vars + if allgather_out in vars and vars[allgather_out].shape == (4, 4): + var_result = True + for op in dist_main_program.global_block().ops: + if op.type == "matmul_v2": + if allgather_out in op.input_arg_names: + op_result = True + return var_result and op_result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_mppp(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # parameter which not been sliced should be the same in the mp scene + self.assertTrue( + check_initialization_for_mppp(dist_startup_prog, rank_id)) + + def test_allgather(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + process_mesh = auto.ProcessMesh(mesh=[0, 3], parent=ROOT_MESH) + with static.program_guard(train_program, startup_program): + x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') + x = auto.shard_tensor(x, process_mesh, dim_mapping=[0, -1]) + + w = paddle.static.data(name="w", shape=[4, 4], dtype='float32') + w = auto.shard_tensor(w, process_mesh, dim_mapping=[-1, -1]) + + y = paddle.distributed.shard_op(paddle.matmul, process_mesh, { + x.name: [-1, -1], + w.name: [-1, -1] + }, **{"x": x, + "y": w})[0] + + rank_id = 0 + dist_context = DistributedContext() + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + complete_train_program = auto.complete_annotation(train_program, + dist_context) + 
auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + reshard(auto_parallel_main_prog, startup_program, rank_id, dist_context) + # the x should not be slice + self.assertTrue(check_allgather(auto_parallel_main_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py new file mode 100644 index 0000000000000..bf2ba9f061fd8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import os +if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: + os.environ["CUDA_VISIBLE_DEVICES"] = '0' + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import get_default_distributed_context +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.process import new_process_group + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0]) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + if _global_parallel_strategy == "pp": + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1]) + else: + auto.shard_tensor( + self.linear0.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", 
shape=[batch_size, 1], dtype='float32') + + if _global_parallel_strategy == "pp": + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + elif _global_parallel_strategy == "dp": + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1]) + else: + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog_with_parallelizer(train_program, startup_program, + dist_context): + global _global_process_mesh + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + + # init parallel optimizer + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + optimizer = fleet.distributed_optimizer(optimizer) + + # fake a comm group + pg = new_process_group([3, 4]) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, startup_program) + + return distributed_main_program, distributed_startup_program + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id == 0: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_serial(self): + global _global_parallel_strategy + _global_parallel_strategy = None + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = get_default_distributed_context() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog_with_parallelizer( + train_program, startup_program, dist_context) + # send and recv should not exist in serial scene. + self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) + + +if __name__ == "__main__": + unittest.main() From 00245cfd2e5fe175a80d13a67b5c75e27930ce59 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 11 Oct 2021 18:40:07 +0800 Subject: [PATCH 53/80] [Paddle-ASP] Revise 4d tensor sparsity mask pattern for conv2d sparsity (#36054) Sparse tensor core for convolution requires the input channel dimension is 2:4 structed sparse. 
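(Illustrative aside, not part of this patch.) A rough NumPy sketch of the idea: 2:4 structured sparsity keeps at most 2 non-zero values in every group of 4 along the masked dimension, so the 4-D conv weight (h, w, in, out) is first rearranged to (h*w*out, in) so that each row runs along the input channels, masked, and then restored, mirroring the utils.py change below. The top-2-of-4 helper and all shapes here are made up for illustration; the real code uses the MaskAlgo functions in sparsity/utils.py.

import numpy as np

def mask_2_of_4(rows):
    # Stand-in for the real mask functions: in every group of 4 values of a
    # row, keep only the 2 with the largest magnitude.
    mask = np.zeros_like(rows)
    for r in range(rows.shape[0]):
        for g in range(0, rows.shape[1], 4):
            group = np.abs(rows[r, g:g + 4])
            keep = np.argsort(group)[-2:]      # indices of the top-2 magnitudes
            mask[r, g + keep] = 1.0
    return mask

h, w, cin, cout = 3, 3, 8, 16
weight = np.random.randn(h, w, cin, cout).astype('float32')

# (h, w, in, out) -> (h*w*out, in): rows now run along the input channels,
# the dimension that sparse tensor cores require to be 2:4 sparse.
t = weight.transpose([0, 1, 3, 2]).reshape(h * w * cout, cin)
mask = mask_2_of_4(t)

# Restore the mask to the original 4-D weight layout.
mask = mask.reshape(h, w, cout, cin).transpose([0, 1, 3, 2])
assert mask.shape == weight.shape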
So we have to mask the input channel dimension for using sparse tensor core --- python/paddle/fluid/contrib/sparsity/utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index bb030cbac1bea..a72ea4d9b8510 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -518,9 +518,13 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): t = t.reshape(shape[0], shape[1]) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3], + shape[2]) + mask = func(t, n=n, m=m) + return mask.reshape([shape[0], shape[1], shape[3], + shape[2]]).transpose([0, 1, 3, 2]).astype(dtype) else: raise ValueError("The dimension of input tensor is not supported in create_mask, " \ "Only dimension < 4 is supported but got {}".format(len(shape))) @@ -572,9 +576,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): t = t.reshape(shape[0], shape[1]) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + t = t.transpose([0, 1, 3, 2]).reshape( + [shape[0] * shape[1] * shape[3], shape[2]]) else: raise ValueError("The dimension of input tensor is not supported in create_mask, " \ "Only dimension < 4 is supported but got {}".format(len(shape))) From 1026052caa2dc18747790b002572c21970f6c6b5 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Mon, 11 Oct 2021 19:01:49 +0800 Subject: [PATCH 54/80] fix_dp_grad_merge_with_grad_clip_by_global_norm (#36334) --- python/paddle/fluid/clip.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 5a9ea1a445e2d..4cca41b527bc2 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -28,6 +28,7 @@ from .data_feeder import check_variable_and_dtype from .framework import in_dygraph_mode from .layer_helper import LayerHelper +from .framework import default_main_program __all__ = [ 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', @@ -547,7 +548,12 @@ def _static_clip(self, params_grads): scale_input = (scale_var.astype('float16') if g.dtype == core.VarDesc.VarType.FP16 else scale_var) - p.block.append_op( + # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g + # will be in different blocks with the gradient clip related ops. + # We need to handle the correct block, otherwise will encounter + # a 'NotFoundError' during compile time. 
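+            # Appending to the current block of the default main program (rather
+            # than to p.block) keeps this op in the same block as the clip ops.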
+ block = default_main_program().current_block() + block.append_op( type='elementwise_mul', inputs={'X': g, 'Y': scale_input}, From fc5415d66859712bfdf37c2e0d330d1aa5d52679 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:18:40 +0800 Subject: [PATCH 55/80] change exit code of pip install dependencies to 5 (#36016) --- paddle/scripts/paddle_build.bat | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index d675f4fdbdb61..c4528fdc75e23 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -89,7 +89,7 @@ if "%WITH_PYTHON%" == "ON" ( pip install -r %work_dir%\python\requirements.txt --user if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! - exit /b 7 + exit /b 5 ) ) @@ -309,7 +309,7 @@ if %GENERATOR% == "Ninja" ( pip install ninja if %errorlevel% NEQ 0 ( echo pip install ninja failed! - exit /b 7 + exit /b 5 ) ) @@ -627,7 +627,7 @@ git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON pip install -r %work_dir%\python\unittest_py\requirements.txt --user if %ERRORLEVEL% NEQ 0 ( echo pip install unittest requirements.txt failed! - exit /b 7 + exit /b 5 ) for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# From eaeeb884f17d5c60f1faf4d1f26c63d14944af97 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:18:51 +0800 Subject: [PATCH 56/80] fix bug of clear third_party cache every 10 days (#36332) --- paddle/scripts/paddle_build.bat | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index c4528fdc75e23..e6320d5bd154d 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -138,6 +138,17 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt rmdir %BUILD_DIR% /s/q + + : clear third party cache every once in a while + if %day_now% EQU 21 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 11 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 01 ( + rmdir %cache_dir%\third_party /s/q + ) goto :mkbuild ) @@ -333,24 +344,6 @@ rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 rem clcache.exe -M 21474836480 rem ------set third_party cache dir------ -: clear third party cache every once in a while -for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# -set day_now=%datetime:~6,2% -set day_before=-1 -set /p day_before=< %cache_dir%\day.txt -if %day_now% NEQ %day_before% ( - echo %day_now% > %cache_dir%\day.txt - type %cache_dir%\day.txt - if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party /s/q - ) -) if "%WITH_TPCACHE%"=="OFF" ( set THIRD_PARTY_PATH=%work_dir:\=/%/%BUILD_DIR%/third_party From 830debc2da15fb42ca9a03f4d331e446248c643e Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:29:07 +0800 Subject: [PATCH 57/80] Add functor_primitives.h for kernel primtive api (#36203) * Add functor_primitives.h for kernel primtive api * update * move namespace kps * subFunctor init_data * delete InvalidArgumentError --- .../kernel_primitives/functor_primitives.h | 230 ++++++++++++++++++ 
.../kernel_primitives/kernel_primitives.h | 1 + 2 files changed, 231 insertions(+) create mode 100644 paddle/fluid/operators/kernel_primitives/functor_primitives.h diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h new file mode 100644 index 0000000000000..fcfcdc28b1f00 --- /dev/null +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -0,0 +1,230 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace operators { +namespace kernel_primitives { +namespace details { + +static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) { + return ::Eigen::numext::exp(x); +} + +static __device__ __forceinline__ float Exp(float x) { return expf(x); } + +static __device__ __forceinline__ double Exp(double x) { return exp(x); } + +static __device__ __forceinline__ platform::float16 Log(platform::float16 x) { + return ::Eigen::numext::log(x); +} + +static __device__ __forceinline__ float Log(float x) { return logf(x); } + +static __device__ __forceinline__ double Log(double x) { return log(x); } + +} // namespace details + +/******************************** Unary Functor *******************************/ + +/** + * @brief Default unary exp functor + */ +template +struct ExpFunctor { + HOSTDEVICE inline ExpFunctor() {} + + HOSTDEVICE explicit inline ExpFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(details::Exp(x)); + } +}; + +/** + * @brief Default unary identity functor + */ +template +struct IdentityFunctor { + HOSTDEVICE inline IdentityFunctor() {} + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } +}; + +/** + * @brief Default unary div functor. Divide by a constant + */ +template +struct DivideFunctor { + HOSTDEVICE inline DivideFunctor() { n_inv = static_cast(1.0f); } + + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((Tx)(1.0 / n)) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x * n_inv); + } + + private: + Tx n_inv; +}; + +/** + * @brief Default unary square functor + */ +template +struct SquareFunctor { + HOSTDEVICE inline SquareFunctor() {} + + HOSTDEVICE explicit inline SquareFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x) * static_cast(x); + } +}; + +/****************************** Binary Functor ********************************/ + +/** + * @brief Default binary min functor + */ +template +struct MinFunctor { + inline T initial() { return static_cast(std::numeric_limits::max()); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b < a) ? 
b : a; + } +}; + +/** + * @brief Default binary max functor + */ +template +struct MaxFunctor { + inline T initial() { + return static_cast(std::numeric_limits::lowest()); + } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b > a) ? b : a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct AddFunctor { + inline T initial() { return static_cast(0.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b + a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct MulFunctor { + inline T initial() { return static_cast(1.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b * a; + } +}; + +/** + * @brief Default binary logic or functor + */ +template +struct LogicalOrFunctor { + inline T initial() { return static_cast(false); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b || a; + } +}; + +/** + * @brief Default binary logic and functor + */ +template +struct LogicalAndFunctor { + inline T initial() { return static_cast(true); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b && a; + } +}; + +/** + * @brief Default binary sub functor + */ +template +struct SubFunctor { + inline T initial() { return static_cast(0.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; + +/** + * @brief Default binary div functor + */ +template +struct DivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivFunctor::value>::type> { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. Please check the input value.")); + return a / b; + } +}; + +/** + * @brief Default binary floor divide functor + */ +template +struct FloorDivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. 
Please check the input value.")); + return static_cast(std::trunc(a / b)); + } +}; + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 45ee4fd738174..9a4f8bb026b9d 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -16,6 +16,7 @@ #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" +#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" #include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" namespace paddle { From a679fcbb26f9f7abb5938d4c201ef5125cd5c580 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:33:43 +0800 Subject: [PATCH 58/80] Add more tests and fix bugs for cudnn_norm_conv_test and cudnn_bn_and_relu_test (#36314) --- .../operators/fused/cudnn_bn_add_relu_test.cc | 650 +++++++++++++++--- .../operators/fused/cudnn_norm_conv_test.cc | 71 +- 2 files changed, 599 insertions(+), 122 deletions(-) diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 7229754cb8ed8..837bca6c2cf4e 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -33,6 +33,8 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP(batch_norm); +USE_CUDA_ONLY_OP(fused_bn_add_activation); +USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); template void InitRandomTensor(const std::vector &dims, @@ -40,7 +42,7 @@ void InitRandomTensor(const std::vector &dims, T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), platform::CPUPlace()); std::default_random_engine random(0); - std::uniform_real_distribution dis(0.0, 1.0); + std::uniform_real_distribution dis(-1.0, 1.0); for (int i = 0; i < cpu_out->numel(); ++i) { cpu_out_ptr[i] = static_cast(dis(random)); } @@ -89,7 +91,7 @@ void CheckOutput(std::string name, const framework::Tensor &cpu_res, } } std::string error_type = is_relative_atol ? "relative" : "absolute"; - LOG(INFO) << "[" << name << "], The dims is [" << cpu_res.dims() + LOG(INFO) << "[" << name << "] The dims is [" << cpu_res.dims() << "], maximum " << error_type << " error is " << max_diff << ": " << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index]; } @@ -121,13 +123,33 @@ void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, } } -// get paddle batchnorm op results as baseline +template +void ComputeInplaceAdd(const framework::Tensor &cpu_x, + framework::Tensor *cpu_y) { + EXPECT_EQ(cpu_x.dims(), cpu_y->dims()); + + const T *cpu_x_ptr = cpu_x.data(); + T *cpu_y_ptr = cpu_y->data(); + for (int64_t i = 0; i < cpu_x.numel(); ++i) { + cpu_y_ptr[i] += cpu_x_ptr[i]; + } +} + +template +void ComputeInplaceRelu(framework::Tensor *cpu_x) { + T *cpu_x_ptr = cpu_x->data(); + for (int64_t i = 0; i < cpu_x->numel(); ++i) { + cpu_x_ptr[i] = + cpu_x_ptr[i] > static_cast(0) ? 
cpu_x_ptr[i] : static_cast(0); + } +} + void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, Tensor *cpu_saved_var, Tensor *cpu_y, - Tensor *cpu_reserve_space) { + Tensor *saved_reserve_space) { framework::Scope scope; auto *x = scope.Var("X")->GetMutable(); auto *scale = scope.Var("Scale")->GetMutable(); @@ -178,68 +200,258 @@ void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, TensorCopySync(*var, platform::CPUPlace(), cpu_var); TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); - TensorCopySync(*reserve_space, platform::CPUPlace(), cpu_reserve_space); + // reserved_space will stay on GPU and used in grad op. + saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_z, + const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *z = scope.Var("Z")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_z, place, z); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation", + {{"X", {"X"}}, {"Z", {"Z"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + // reserved_space will stay on GPU and used in grad op. 
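+  // (ShareDataWith keeps the reserve space on GPU without a copy, unlike the
+  // TensorCopySync calls above; the backward pass consumes it directly.)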
+ saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluBackward( + const platform::CUDADeviceContext &ctx, const Tensor &cpu_dy, + const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, + const Tensor &cpu_saved_mean, const Tensor &cpu_saved_var, + const Tensor &cpu_y, const Tensor &saved_reserve_space, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *dy = scope.Var("Y@GRAD")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + auto *dx = scope.Var("X@GRAD")->GetMutable(); + auto *dz = scope.Var("Z@GRAD")->GetMutable(); + auto *dscale = scope.Var("Scale@GRAD")->GetMutable(); + auto *dbias = scope.Var("Bias@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_y, place, y); + TensorCopySync(cpu_dy, place, dy); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(cpu_saved_mean, place, saved_mean); + TensorCopySync(cpu_saved_var, place, saved_var); + reserve_space->ShareDataWith(saved_reserve_space); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + saved_mean->Resize({channels}); + saved_var->Resize({channels}); + + framework::AttributeMap attrs; + float momentum = 0.9; + float epsilon = 1e-5; + std::string act_type = "relu"; + attrs.insert({"momentum", momentum}); + attrs.insert({"epsilon", epsilon}); + attrs.insert({"act_type", act_type}); + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation_grad", {{"X", {"X"}}, + {"Y", {"Y"}}, + {"Y@GRAD", {"Y@GRAD"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + {{"X@GRAD", {"X@GRAD"}}, + {"Z@GRAD", {"Z@GRAD"}}, + {"Scale@GRAD", {"Scale@GRAD"}}, + {"Bias@GRAD", {"Bias@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); } template class CudnnBNAddReluTester { public: - CudnnBNAddReluTester(int batch_size, int height, int width, int channels) { + CudnnBNAddReluTester(int batch_size, int height, int width, int channels, + std::string act_type, bool fuse_add, bool has_shortcut) { batch_size_ = batch_size; height_ = height; width_ = width; channels_ = channels; ele_count_ = batch_size_ * height_ * width_; + act_type_ = act_type; + fuse_add_ = fuse_add; + has_shortcut_ = has_shortcut; SetUp(); } ~CudnnBNAddReluTester() {} void CheckForward(float diff, bool is_relative_atol = false) { + LOG(INFO) << "[CheckForward, diff=" << diff + << ", is_relative_atol=" << is_relative_atol + << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ + << ", has_shortcut=" << has_shortcut_; platform::CUDADeviceContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); - framework::Tensor cpu_mean_base; - framework::Tensor cpu_var_base; - framework::Tensor cpu_saved_mean_base; - 
framework::Tensor cpu_saved_var_base; - framework::Tensor cpu_y_base; - framework::Tensor cpu_reserve_space_base; - BaselineForward(*ctx, &cpu_mean_base, &cpu_var_base, &cpu_saved_mean_base, - &cpu_saved_var_base, &cpu_y_base, &cpu_reserve_space_base); - - framework::Tensor cpu_mean; - framework::Tensor cpu_var; - framework::Tensor cpu_saved_mean; - framework::Tensor cpu_saved_var; - framework::Tensor cpu_y; - framework::Tensor cpu_bitmask; - FusedForward(*ctx, &cpu_mean, &cpu_var, &cpu_saved_mean, &cpu_saved_var, - &cpu_y, &cpu_bitmask); + auto select = [&](Tensor *in) { return has_shortcut_ ? in : nullptr; }; - CheckOutput("Mean", cpu_mean, cpu_mean_base, diff, is_relative_atol); - CheckOutput("Variance", cpu_var, cpu_var_base, diff, + framework::Tensor cpu_mean_base_x; + framework::Tensor cpu_var_base_x; + framework::Tensor cpu_mean_base_z; + framework::Tensor cpu_var_base_z; + if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) { + BaselineForwardFusedBNAddRelu( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_); + } else { + BaselineForward( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_, + select(&cpu_mean_base_z), select(&cpu_var_base_z), + select(&cpu_saved_mean_base_z_), select(&cpu_saved_var_base_z_), + select(&saved_reserve_space_z_)); + } + + framework::Tensor cpu_mean_x; + framework::Tensor cpu_var_x; + framework::Tensor cpu_y; + framework::Tensor cpu_mean_z; + framework::Tensor cpu_var_z; + FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, &cpu_saved_mean_x_, + &cpu_saved_var_x_, &cpu_y, &cpu_bitmask_, select(&cpu_mean_z), + select(&cpu_var_z), select(&cpu_saved_mean_z_), + select(&cpu_saved_var_z_)); + + CheckOutput("Mean", cpu_mean_x, cpu_mean_base_x, diff, + is_relative_atol); + CheckOutput("Variance", cpu_var_x, cpu_var_base_x, diff, is_relative_atol); - CheckOutput("SavedMean", cpu_saved_mean, cpu_saved_mean_base, diff, + CheckOutput("SavedMean", cpu_saved_mean_x_, cpu_saved_mean_base_x_, + diff, is_relative_atol); + CheckOutput("SavedVariance", cpu_saved_var_x_, cpu_saved_var_base_x_, + diff, is_relative_atol); + if (has_shortcut_) { + CheckOutput("MeanZ", cpu_mean_z, cpu_mean_base_z, diff, + is_relative_atol); + CheckOutput("VarianceZ", cpu_var_z, cpu_var_base_z, diff, + is_relative_atol); + CheckOutput("SavedMeanZ", cpu_saved_mean_z_, + cpu_saved_mean_base_z_, diff, is_relative_atol); + CheckOutput("SavedVarianceZ", cpu_saved_var_z_, + cpu_saved_var_base_z_, diff, is_relative_atol); + } + CheckOutput("Y", cpu_y, cpu_y_base_, diff, is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_dx_base; + framework::Tensor cpu_dz_base; + framework::Tensor cpu_dscale_base; + framework::Tensor cpu_dbias_base; + BaselineBackwardFusedBNAddRelu(*ctx, &cpu_dx_base, &cpu_dz_base, + &cpu_dscale_base, &cpu_dbias_base); + + framework::Tensor cpu_dx; + framework::Tensor cpu_dz; + framework::Tensor cpu_dscale; + framework::Tensor cpu_dbias; + FusedBackward(*ctx, &cpu_dx, &cpu_dz, &cpu_dscale, &cpu_dbias); + + CheckOutput("DX", cpu_dx, cpu_dx_base, diff, is_relative_atol); + CheckOutput("DZ", cpu_dz, cpu_dz_base, diff, is_relative_atol); + CheckOutput("DScale", cpu_dscale, cpu_dscale_base, diff, is_relative_atol); - 
CheckOutput("SavedVariance", cpu_saved_var, cpu_saved_var_base, diff, + CheckOutput("DBias", cpu_dbias, cpu_dbias_base, diff, is_relative_atol); - CheckOutput("Y", cpu_y, cpu_y_base, diff, is_relative_atol); } private: void SetUp() { - // Initialize input data InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_x_); - ComputeSumAndSquareSum(cpu_x_, &cpu_sum_, &cpu_sum_of_square_); + InitRandomTensor({channels_}, &cpu_bn_scale_x_); + InitRandomTensor({channels_}, &cpu_bn_bias_x_); - // scale and bias should be initialized randomly. - InitConstantTensor({channels_}, static_cast(1.0f), - &cpu_bn_scale_); - InitConstantTensor({channels_}, static_cast(0.0f), - &cpu_bn_bias_); + if (has_shortcut_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + InitRandomTensor({channels_}, &cpu_bn_scale_z_); + InitRandomTensor({channels_}, &cpu_bn_bias_z_); + } else { + if (fuse_add_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + } + } + + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_dy_); } void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, @@ -252,71 +464,178 @@ class CudnnBNAddReluTester { cpu_saved_var); } - void BaselineForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, - Tensor *cpu_var, Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, Tensor *cpu_y, - Tensor *cpu_reserve_space) { + void BaselineForward(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean_x, Tensor *cpu_var_x, + Tensor *cpu_saved_mean_x, Tensor *cpu_saved_var_x, + Tensor *cpu_y, Tensor *saved_reserve_space_x, + Tensor *cpu_mean_z = nullptr, + Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr, + Tensor *saved_reserve_space_z = nullptr) { + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_mean_x, cpu_var_x, cpu_saved_mean_x, + cpu_saved_var_x, cpu_y, saved_reserve_space_x); + if (has_shortcut_) { + framework::Tensor cpu_z_out; + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + ComputeBatchNormForward( + ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, cpu_mean_z, cpu_var_z, + cpu_saved_mean_z, cpu_saved_var_z, &cpu_z_out, saved_reserve_space_z); + ComputeInplaceAdd(cpu_z_out, cpu_y); + } else { + if (fuse_add_) { + ComputeInplaceAdd(cpu_z_, cpu_y); + } + } + if (act_type_ == "relu") { + ComputeInplaceRelu(cpu_y); + } + } + + void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean, Tensor *cpu_var, + Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); - ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_, cpu_bn_bias_, cpu_mean, - cpu_var, cpu_saved_mean, cpu_saved_var, cpu_y, - cpu_reserve_space); + ComputeFusedBNAddReluForward( + ctx, cpu_x_, cpu_z_, cpu_bn_scale_x_, cpu_bn_bias_x_, cpu_mean, cpu_var, + cpu_saved_mean, cpu_saved_var, cpu_y, saved_reserve_space); + } + + void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_dx, Tensor *cpu_dz, + Tensor *cpu_dscale, Tensor *cpu_dbias) { + ComputeFusedBNAddReluBackward( + ctx, cpu_dy_, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_saved_mean_base_x_, cpu_saved_var_base_x_, cpu_y_base_, + saved_reserve_space_x_, cpu_dx, cpu_dz, cpu_dscale, cpu_dbias); + } + + void ComputeFusedBNStatsFinalize(const 
platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, + const Tensor &cpu_bn_scale, + const Tensor &cpu_bn_bias, Tensor *sum, + Tensor *sum_of_square, Tensor *bn_scale, + Tensor *bn_bias, Tensor *mean, Tensor *var, + Tensor *saved_mean, Tensor *saved_var, + Tensor *equiv_scale, Tensor *equiv_bias) { + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_sum, place, sum); + TensorCopySync(cpu_sum_of_square, place, sum_of_square); + TensorCopySync(cpu_bn_scale, place, bn_scale); + TensorCopySync(cpu_bn_bias, place, bn_bias); + + bn_scale->Resize({1, 1, 1, channels_}); + bn_bias->Resize({1, 1, 1, channels_}); + + // input + float *sum_ptr = sum->data(); + float *sum_of_square_ptr = sum_of_square->data(); + float *bn_scale_ptr = bn_scale->data(); + float *bn_bias_ptr = bn_bias->data(); + + mean->Resize({1, 1, 1, channels_}); + var->Resize({1, 1, 1, channels_}); + + // output + float *mean_ptr = mean->data(); + float *var_ptr = var->data(); + float *saved_mean_ptr = + saved_mean->mutable_data({1, 1, 1, channels_}, place); + float *saved_var_ptr = + saved_var->mutable_data({1, 1, 1, channels_}, place); + T *equiv_scale_ptr = + equiv_scale->mutable_data({1, 1, 1, channels_}, place); + T *equiv_bias_ptr = + equiv_bias->mutable_data({1, 1, 1, channels_}, place); + + auto param_shape = framework::vectorize(bn_scale->dims()); + op::CudnnBNStatsFinalize bn_op(ctx, param_shape); + bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, + equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, + true); } // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu - void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean, - Tensor *cpu_var, Tensor *cpu_saved_mean, - Tensor *cpu_saved_var, Tensor *cpu_y, Tensor *cpu_bitmask) { + void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean_x, + Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, + Tensor *cpu_saved_var_x, Tensor *cpu_y, Tensor *cpu_bitmask, + Tensor *cpu_mean_z = nullptr, Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr) { framework::Tensor x; - framework::Tensor sum; - framework::Tensor sum_of_square; - framework::Tensor bn_scale; - framework::Tensor bn_bias; + framework::Tensor sum_x; + framework::Tensor sum_of_square_x; + framework::Tensor bn_scale_x; + framework::Tensor bn_bias_x; + + framework::Tensor z; + framework::Tensor sum_z; + framework::Tensor sum_of_square_z; + framework::Tensor bn_scale_z; + framework::Tensor bn_bias_z; auto place = ctx.GetPlace(); TensorCopySync(cpu_x_, place, &x); - TensorCopySync(cpu_sum_, place, &sum); - TensorCopySync(cpu_sum_of_square_, place, &sum_of_square); - TensorCopySync(cpu_bn_scale_, place, &bn_scale); - TensorCopySync(cpu_bn_bias_, place, &bn_bias); + if (fuse_add_ || has_shortcut_) { + TensorCopySync(cpu_z_, place, &z); + } - bn_scale.Resize({1, 1, 1, channels_}); - bn_bias.Resize({1, 1, 1, channels_}); + framework::Tensor mean_x; + framework::Tensor var_x; + framework::Tensor saved_mean_x; + framework::Tensor saved_var_x; + framework::Tensor equiv_scale_x; + framework::Tensor equiv_bias_x; - T *x_ptr = x.data(); - float *sum_ptr = sum.data(); - float *sum_of_square_ptr = sum_of_square.data(); - float *bn_scale_ptr = bn_scale.data(); - float *bn_bias_ptr = bn_bias.data(); + framework::Tensor mean_z; 
+ framework::Tensor var_z; + framework::Tensor saved_mean_z; + framework::Tensor saved_var_z; + framework::Tensor equiv_scale_z; + framework::Tensor equiv_bias_z; - framework::Tensor mean; - framework::Tensor var; - framework::Tensor saved_mean; - framework::Tensor saved_var; - framework::Tensor equiv_scale; - framework::Tensor equiv_bias; framework::Tensor y; framework::Tensor bitmask; - InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); - TensorCopySync(*cpu_mean, place, &mean); - TensorCopySync(*cpu_var, place, &var); + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + TensorCopySync(*cpu_mean_x, place, &mean_x); + TensorCopySync(*cpu_var_x, place, &var_x); + if (has_shortcut_) { + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + TensorCopySync(*cpu_mean_z, place, &mean_z); + TensorCopySync(*cpu_var_z, place, &var_z); + } - mean.Resize({1, 1, 1, channels_}); - var.Resize({1, 1, 1, channels_}); + // 1. BN Stats Finalize + ComputeFusedBNStatsFinalize(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + &sum_x, &sum_of_square_x, &bn_scale_x, + &bn_bias_x, &mean_x, &var_x, &saved_mean_x, + &saved_var_x, &equiv_scale_x, &equiv_bias_x); + if (has_shortcut_) { + ComputeFusedBNStatsFinalize(ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, + &sum_z, &sum_of_square_z, &bn_scale_z, + &bn_bias_z, &mean_z, &var_z, &saved_mean_z, + &saved_var_z, &equiv_scale_z, &equiv_bias_z); + } - float *mean_ptr = mean.data(); - float *var_ptr = var.data(); - float *saved_mean_ptr = - saved_mean.mutable_data({1, 1, 1, channels_}, place); - float *saved_var_ptr = - saved_var.mutable_data({1, 1, 1, channels_}, place); - T *equiv_scale_ptr = - equiv_scale.mutable_data({1, 1, 1, channels_}, place); - T *equiv_bias_ptr = equiv_bias.mutable_data({1, 1, 1, channels_}, place); + T *x_ptr = x.data(); + T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data() : nullptr; + T *equiv_scale_x_ptr = equiv_scale_x.data(); + T *equiv_bias_x_ptr = equiv_bias_x.data(); + T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data() : nullptr; + T *equiv_bias_z_ptr = has_shortcut_ ? equiv_bias_z.data() : nullptr; T *y_ptr = y.mutable_data({batch_size_, height_, width_, channels_}, place); - // bitmask int c = channels_; int64_t nhw = ele_count_; int32_t c_int32_elems = ((c + 63) & ~63) / 32; @@ -325,31 +644,90 @@ class CudnnBNAddReluTester { {nhw_int32_elems, c_int32_elems, 1}, place); auto data_shape = framework::vectorize(x.dims()); - auto param_shape = framework::vectorize(bn_scale.dims()); + auto param_shape = framework::vectorize(bn_scale_x.dims()); auto bitmask_shape = framework::vectorize(bitmask.dims()); - // 1. BN Stats Finalize - op::CudnnBNStatsFinalize bn_op(ctx, param_shape); - bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, - saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, - equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, - true); - - // 2. Scale Bias + Relu (not fused add) - std::string act_type = ""; - op::CudnnScaleBiasAddRelu sbar_op( - ctx, act_type, false, false, data_shape, param_shape, bitmask_shape); - sbar_op.Forward(ctx, x_ptr, equiv_scale_ptr, equiv_bias_ptr, y_ptr, - bitmask_ptr); - - TensorCopySync(mean, platform::CPUPlace(), cpu_mean); - TensorCopySync(var, platform::CPUPlace(), cpu_var); - TensorCopySync(saved_mean, platform::CPUPlace(), cpu_saved_mean); - TensorCopySync(saved_var, platform::CPUPlace(), cpu_saved_var); + // 2. 
Scale Bias + Relu + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, + has_shortcut_, data_shape, param_shape, + bitmask_shape); + sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr, + bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr); + + TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); + TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); + TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x); + TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x); + if (has_shortcut_) { + TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z); + TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); + TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z); + TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z); + } TensorCopySync(y, platform::CPUPlace(), cpu_y); TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); } + // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedBackward(const platform::CUDADeviceContext &ctx, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Tensor dy; + framework::Tensor x; + framework::Tensor bn_scale; + framework::Tensor bn_bias; + framework::Tensor saved_mean; + framework::Tensor saved_var; + framework::Tensor bitmask; + framework::Tensor dx; + framework::Tensor dz; + framework::Tensor dscale; + framework::Tensor dbias; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_dy_, place, &dy); + TensorCopySync(cpu_x_, place, &x); + TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); + TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); + TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); + TensorCopySync(cpu_saved_var_x_, place, &saved_var); + TensorCopySync(cpu_bitmask_, place, &bitmask); + + bn_scale.Resize({1, 1, 1, channels_}); + bn_bias.Resize({1, 1, 1, channels_}); + saved_mean.Resize({1, 1, 1, channels_}); + saved_var.Resize({1, 1, 1, channels_}); + + T *dy_ptr = dy.data(); + T *x_ptr = x.data(); + float *bn_scale_ptr = bn_scale.data(); + float *bn_bias_ptr = bn_bias.data(); + float *saved_mean_ptr = saved_mean.data(); + float *saved_var_ptr = saved_var.data(); + int32_t *bitmask_ptr = bitmask.data(); + T *dx_ptr = + dx.mutable_data({batch_size_, height_, width_, channels_}, place); + T *dz_ptr = + dz.mutable_data({batch_size_, height_, width_, channels_}, place); + float *dscale_ptr = dscale.mutable_data({1, 1, 1, channels_}, place); + float *dbias_ptr = dbias.mutable_data({1, 1, 1, channels_}, place); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + std::string act_type = "relu"; + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, + param_shape, bitmask_shape); + sbar_op.Backward(ctx, dy_ptr, x_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, bitmask_ptr, dx_ptr, dz_ptr, + dscale_ptr, dbias_ptr, eps_); + + TensorCopySync(dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); + } + private: int batch_size_; int height_; @@ -357,24 +735,80 @@ class CudnnBNAddReluTester { int channels_; int ele_count_; + std::string act_type_; + bool fuse_add_; + bool has_shortcut_; + // Forward input framework::Tensor cpu_x_; - framework::Tensor cpu_sum_; - framework::Tensor 
cpu_sum_of_square_; - framework::Tensor cpu_bn_scale_; - framework::Tensor cpu_bn_bias_; + framework::Tensor cpu_bn_scale_x_; + framework::Tensor cpu_bn_bias_x_; + framework::Tensor cpu_z_; + framework::Tensor cpu_bn_scale_z_; + framework::Tensor cpu_bn_bias_z_; + + // Backward input + framework::Tensor cpu_dy_; + framework::Tensor cpu_bitmask_; + framework::Tensor cpu_saved_mean_x_; + framework::Tensor cpu_saved_var_x_; + framework::Tensor cpu_saved_mean_z_; + framework::Tensor cpu_saved_var_z_; + framework::Tensor cpu_saved_mean_base_x_; + framework::Tensor cpu_saved_var_base_x_; + framework::Tensor saved_reserve_space_x_; + framework::Tensor cpu_saved_mean_base_z_; + framework::Tensor cpu_saved_var_base_z_; + framework::Tensor saved_reserve_space_z_; + framework::Tensor cpu_y_base_; double eps_ = 1e-5; float momentum_ = 0.9; }; -TEST(CudnnBNAddReluForward, GPUCudnnBNAddReluForwardFp16) { +TEST(CudnnBNAddReluFp16, BNAdd) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = ""; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + } +} + +TEST(CudnnBNAddReluFp16, BNAddRelu) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = "relu"; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + if (fuse_add) { + test.CheckBackward(2e-4); + } + } +} + +TEST(CudnnBNAddReluFp16, HasShortcut) { int batch_size = 4; int height = 8; int width = 8; int channels = 64; + std::string act_type = ""; + bool fuse_add = false; + bool has_shortcut = true; FLAGS_cudnn_batchnorm_spatial_persistent = true; - CudnnBNAddReluTester test(batch_size, height, - width, channels); - test.CheckForward(2e-3); + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(5e-3); } diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index fff7b327f3f2e..4c14029b99c69 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -92,10 +92,9 @@ void CheckOutput(const framework::Tensor &cpu_res, } // Use Paddle conv2d op results as baseline -template void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, - Tensor *cpu_output) { + Tensor *cpu_output, int stride, int padding) { framework::Scope scope; auto *input = scope.Var("Input")->GetMutable(); auto *filter = scope.Var("Filter")->GetMutable(); @@ -108,10 +107,12 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); attrs.insert({"use_cudnn", use_cudnn}); attrs.insert({"data_format", data_format}); - attrs.insert({"padding_algorithm", padding_algorithm}); auto op = framework::OpRegistry::CreateOp( "conv2d", {{"Input", {"Input"}}, 
{"Filter", {"Filter"}}}, @@ -122,7 +123,6 @@ void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, } // Use Paddle conv2d_grad op results as baseline -template void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, const Tensor &cpu_input, const Tensor &cpu_filter, const Tensor &cpu_output_grad, @@ -147,7 +147,7 @@ void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::string padding_algorithm = "EXPLICIT"; std::vector strides = {stride, stride}; std::vector paddings = {padding, padding}; std::vector dilations = {dilation, dilation}; @@ -216,6 +216,8 @@ class CudnnNormConvolutionTester { kernel_size_ = kernel_size; stride_ = stride; padding_ = (kernel_size_ - 1) / 2; + out_height_ = (height_ + 2 * padding_ - kernel_size_) / stride_ + 1; + out_width_ = (width_ + 2 * padding_ - kernel_size_) / stride_ + 1; SetUp(); } @@ -227,6 +229,15 @@ class CudnnNormConvolutionTester { platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(0))); + if (!Support(*ctx)) { + LOG(INFO) + << "Current test is only supported in the platforms with " + << "compatiblity greater than or equal to 70 and the kernel size " + << "must be equal to 1 or 3. Besides, when the kernel size is 1, " + << "the stride must be 1 if the compatiblity is equal to 70."; + return; + } + framework::Tensor cpu_output_base; framework::Tensor cpu_sum_base; framework::Tensor cpu_sum_of_square_base; @@ -277,15 +288,17 @@ class CudnnNormConvolutionTester { &cpu_filter_nchw_); // transpoes for filter, NCHW -> NHWC TransposeNchwToNhwc(cpu_filter_nchw_, &cpu_filter_nhwc_); - InitRandomTensor({batch_size_, height_, width_, output_channels_}, - &cpu_output_grad_); + InitRandomTensor( + {batch_size_, out_height_, out_width_, output_channels_}, + &cpu_output_grad_); } void BaselineForward(const platform::CUDADeviceContext &ctx, framework::Tensor *cpu_output_base, framework::Tensor *cpu_sum_base, framework::Tensor *cpu_sum_of_square_base) { - ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base); + ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base, + stride_, padding_); ComputeSumAndSquareSum(*cpu_output_base, cpu_sum_base, cpu_sum_of_square_base); } @@ -293,10 +306,9 @@ class CudnnNormConvolutionTester { void BaselineBackward(const platform::CUDADeviceContext &ctx, framework::Tensor *cpu_input_grad_base, framework::Tensor *cpu_filter_grad_base) { - ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, - cpu_output_grad_, cpu_input_grad_base, - cpu_filter_grad_base, stride_, padding_, - dilation_); + ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_grad_, + cpu_input_grad_base, cpu_filter_grad_base, stride_, + padding_, dilation_); } // get forward results of cudnn_norm_conv @@ -316,7 +328,7 @@ class CudnnNormConvolutionTester { T *input_ptr = input.data(); T *filter_ptr = filter_nhwc.data(); T *output_ptr = output.mutable_data( - {batch_size_, height_, width_, output_channels_}, place); + {batch_size_, out_height_, out_width_, output_channels_}, place); float *sum_ptr = sum.mutable_data({1, 1, 1, output_channels_}, place); float *sum_of_square_ptr = @@ -369,10 +381,25 @@ class CudnnNormConvolutionTester { TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } + bool Support(const platform::CUDADeviceContext &ctx) { + if (ctx.GetComputeCapability() == 70) { + if ((kernel_size_ == 3) || 
((kernel_size_ == 1) && (stride_ == 1))) { + return true; + } + } else if (ctx.GetComputeCapability() > 70) { + if ((kernel_size_ == 3) || (kernel_size_ == 1)) { + return true; + } + } + return false; + } + private: int batch_size_; int height_; int width_; + int out_height_; + int out_width_; int input_channels_; int output_channels_; int kernel_size_; @@ -437,3 +464,19 @@ TEST(CudnnNormConvFp16, K1S1O4) { test.CheckForward(1e-3, true); test.CheckBackward(1e-3, true); } + +// test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 +TEST(CudnnNormConvFp16, K1S2O4) { + int batch_size = 4; + int height = 8; + int width = 8; + int input_channels = 32; + int output_channels = 128; + int kernel_size = 1; + int stride = 2; + CudnnNormConvolutionTester test( + batch_size, height, width, input_channels, output_channels, kernel_size, + stride); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3); +} From 14393876fca754330fe68e7c244a8d81d863b5a9 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 11 Oct 2021 13:43:07 +0200 Subject: [PATCH 59/80] added missing bf16 ops (#36291) --- .../framework/ir/graph_pattern_detector.cc | 37 ++++++++++++++----- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 4150d0ca555c9..449849762cb10 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2263,15 +2263,34 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "conv2d_transpose", - "elementwise_add", "elementwise_mul", "fc", - "fusion_gru", "fusion_lstm", "gelu", - "layer_norm", "matmul", "matmul_v2", - "pool2d", "prelu", "relu", - "reshape2", "softmax", "split", - "squeeze", "squeeze2", "sum", - "transpose2"}); + std::unordered_set({"cast", + "clip", + "concat", + "conv2d", + "conv2d_transpose", + "elementwise_add", + "elementwise_mul", + "expand_v2", + "fc", + "fusion_gru", + "fusion_lstm", + "gelu", + "layer_norm", + "matmul", + "matmul_v2", + "pool2d", + "prelu", + "relu", + "reshape2", + "scale", + "sigmoid", + "slice", + "softmax", + "split", + "squeeze", + "squeeze2", + "sum", + "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } From 85b77232768b53ee3db2f86653eeeedccbf570d1 Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:53:53 +0800 Subject: [PATCH 60/80] Add nn.functional.sparse_attention and some test cases, test=develop (#35757) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add paddle.nn.functional.sparse_attention API 本个PR主要将sparse_attention功能在python层进行了一层封装,OP的主体代码见:#PR35676 此外,对于封装的python 接口,增加了相应的单测。 --- paddle/fluid/operators/CMakeLists.txt | 2 +- .../fluid/tests/unittests/CMakeLists.txt | 5 + .../unittests/test_sparse_attention_op.py | 151 +++++++++++++++--- python/paddle/nn/functional/__init__.py | 3 + .../paddle/nn/functional/sparse_attention.py | 144 +++++++++++++++++ 5 files changed, 285 insertions(+), 20 deletions(-) create mode 100644 python/paddle/nn/functional/sparse_attention.py diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt 
index c487313f91c58..b910b4ec73901 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -94,7 +94,7 @@ if (WITH_GPU OR WITH_ROCM) endif() op_library(sync_batch_norm_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") - if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) ) + if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) ) op_library(sparse_attention_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n") endif() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0c2731bc45258..9d6a1d00cff60 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -464,6 +464,11 @@ list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) +# disable sparse_attention which not in suitable env +if ( (NOT WITH_GPU) OR (WIN32) OR (PADDLE_WITH_ARM) OR (WITH_ROCM) ) + list(REMOVE_ITEM TEST_OPS test_sparse_attention_op) +endif() + if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset_dataloader) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py index 48401fb55ef3f..5134b885f3307 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -16,10 +16,13 @@ import numpy as np from op_test import OpTest import paddle.fluid.core as core +from paddle.static import Program, program_guard import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.nn.functional as F import os import re -import platform def get_cuda_version(): @@ -34,22 +37,6 @@ def get_cuda_version(): return -1 -def get_linux_platform(): - if platform.system().lower() == 'windows': - return 0 - elif platform.system().lower() == 'linux': - return 1 - else: - return -1 - - -def get_suitable_env(): - if get_cuda_version() >= 11020 and get_linux_platform() == 1: - return True - else: - return False - - def softmax(x): max = np.max(x, axis=1, keepdims=True) e_x = np.exp(x - max) @@ -141,8 +128,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize): @unittest.skipIf( - not core.is_compiled_with_cuda() or get_suitable_env() == False, - "core is not compiled with CUDA and cuda version need >= 11.2 in windows") + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" +) class TestSparseAttentionOp(OpTest): def config(self): self.shape = (1, 1, 16, 8) @@ -201,5 +189,130 @@ def config(self): self.dtype = "float64" +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" +) +class TestSparseAttentionAPI(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (1, 1, 8, 4) + self.blocksize = 2 + self.dtype = 'float64' + + def test_static_graph(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + Q = paddle.static.data(name="Q", shape=self.shape, dtype=self.dtype) + K = 
paddle.static.data(name="K", shape=self.shape, dtype=self.dtype) + V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype) + + batch_size, num_heads, rows = self.shape[0], self.shape[ + 1], self.shape[2] + block_num = rows / self.blocksize + block_last = rows % self.blocksize + sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last + offset_shape = (batch_size, num_heads, rows + 1) + columns_shape = (batch_size, num_heads, int(sparse_nnz_num)) + + offset = paddle.static.data( + name="Offset", shape=offset_shape, dtype="int32") + columns = paddle.static.data( + name="Columns", shape=columns_shape, dtype="int32") + Out = F.sparse_attention(Q, K, V, offset, columns) + + Q_np = np.random.random(self.shape).astype(self.dtype) + K_np = np.random.random(self.shape).astype(self.dtype) + V_np = np.random.random(self.shape).astype(self.dtype) + offset_np, columns_np = init_csr_format( + self.shape[0], self.shape[1], self.shape[2], self.blocksize) + offset_np = offset_np.astype('int32') + columns_np = columns_np.astype('int32') + + exe = fluid.Executor(self.place) + fetches_result = exe.run(feed={ + "Q": Q_np, + "K": K_np, + "V": V_np, + "Offset": offset_np, + "Columns": columns_np + }, + fetch_list=[Out]) + expected_result, __, __ = ref_batch_sparse_attention( + Q_np, K_np, V_np, offset_np, columns_np) + + self.assertTrue( + np.allclose( + fetches_result, expected_result, atol=1e-5)) + + def test_dygraph(self): + paddle.disable_static() + offset, columns = init_csr_format(self.shape[0], self.shape[1], + self.shape[2], self.blocksize) + offset = offset.astype('int32') + columns = columns.astype('int32') + query = np.random.random(self.shape).astype(self.dtype) + key = np.random.random(self.shape).astype(self.dtype) + value = np.random.random(self.shape).astype(self.dtype) + + paddle_query = paddle.to_tensor(query, place=self.place) + paddle_key = paddle.to_tensor(key, place=self.place) + paddle_value = paddle.to_tensor(value, place=self.place) + paddle_offset = paddle.to_tensor(offset, place=self.place) + paddle_colunmns = paddle.to_tensor(columns, place=self.place) + + paddle_result = F.sparse_attention(paddle_query, paddle_key, + paddle_value, paddle_offset, + paddle_colunmns) + + numpy_result, __, __ = ref_batch_sparse_attention(query, key, value, + offset, columns) + numpy_result = numpy_result.astype(self.dtype) + + self.assertTrue( + np.allclose( + paddle_result.numpy(), numpy_result, atol=1e-5)) + + +class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 2, 8, 4) + self.blocksize = 2 + self.dtype = 'float32' + + +class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 2, 64, 32) + self.blocksize = 2 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 1, 64, 32) + self.blocksize = 2 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (4, 4, 128, 32) + self.blocksize = 8 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (3, 3, 35, 15) + self.blocksize = 3 + self.dtype = 'float64' + + if __name__ == '__main__': unittest.main() diff --git 
a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 7965b362b9c55..4151f25b94aff 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -112,6 +112,8 @@ from ...fluid.layers import gather_tree # noqa: F401 from ...fluid.layers import temporal_shift # noqa: F401 +from .sparse_attention import sparse_attention + __all__ = [ #noqa 'conv1d', 'conv1d_transpose', @@ -207,4 +209,5 @@ 'layer_norm', 'instance_norm', 'class_center_sample', + 'sparse_attention', ] diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py new file mode 100644 index 0000000000000..f57669f11457f --- /dev/null +++ b/python/paddle/nn/functional/sparse_attention.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +import paddle +from ...fluid.framework import in_dygraph_mode, default_main_program +from paddle.fluid.layer_helper import LayerHelper +from ...fluid.framework import in_dygraph_mode +from paddle import _C_ops + + +def sparse_attention(query, + key, + value, + sparse_csr_offset, + sparse_csr_columns, + name=None): + r""" + This operator sparsify the Attention matrix in Transformer module + to achieve the effect of reducing memory consumption and computation. + The sparse layout is expressed in CSR format and contains two parameters, + ``offset`` and ``columns``. + + .. math:: + + result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V + + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + The dimensions of the three parameters are the same. + ``d`` represents the size of the last dimension of the three parameters. + + Parameters: + query(Tensor): The query tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + key(Tensor): The key tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + value(Tensor): The value tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + sparse_csr_offset(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the offset represents + the number of non-zero elements in each row of the matrix. + It's a 3-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len + 1]`. + The dtype should be ``int32``. + sparse_csr_columns(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the columns represent + the column index values of non-zero elements in the matrix. + It's a 3-D tensor with a shape of + :math:`[batch\_size, num\_heads, sparse\_nnz]`. + The dtype should be ``int32``. 
+ name(str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to + :ref:`api_guide_Name`. + + Returns: + A Tensor which refers to the result in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + + Examples: + .. code-block:: python + + # required: skiptest + import paddle + import numpy as np + + query_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + key_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + value_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + sparse_csr_offset_data = np.array([[[0, 2, + 4, 6, 8]]]).astype("int32") + sparse_csr_columns_data = np.array([[[0, 1, + 0, 1, 2, 3, 2, 3]]]).astype("int32") + print(query_data.shape) + # (1, 1, 4, 2) + print(sparse_csr_offset_data.shape) + # (1, 1, 5) + print(sparse_csr_columns_data.shape) + # (1, 1, 8) + paddle.disable_static() + query = paddle.to_tensor(query_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + key = paddle.to_tensor(key_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + value = paddle.to_tensor(value_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + output = paddle.nn.functional.sparse_attention(query, key, + value, offset, columns) + print(output) + + # [[[[1.60885942, 2.60885954], + # [1.99830270, 2.99830270], + # [1.60885942, 2.60885954], + # [1.99830270, 2.99830270]]]] + """ + if in_dygraph_mode(): + result_attention, result_sdd, result_softmax = _C_ops.sparse_attention( + query, key, value, sparse_csr_offset, sparse_csr_columns) + return result_attention + + helper = LayerHelper('sparse_attention', **locals()) + dtype = helper.input_dtype(input_param_name='Q') + out = helper.create_variable_for_type_inference(dtype) + result_sdd = helper.create_variable_for_type_inference(dtype) + result_softmax = helper.create_variable_for_type_inference(dtype) + inputs = { + 'Q': query, + 'K': key, + 'V': value, + 'Offset': sparse_csr_offset, + 'Columns': sparse_csr_columns + } + outputs = { + 'Out': out, + 'SparseDotSdd': result_sdd, + 'Softmax': result_softmax + } + helper.append_op(type='sparse_attention', inputs=inputs, outputs=outputs) + return out From 7b45a46e13fe057ca12a001dac7b8d6d24d9f211 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 11 Oct 2021 19:59:16 +0800 Subject: [PATCH 61/80] Add FLAGS_allreduce_record_one_event to remove event waiting number (#36263) * add FLAGS_allreduce_record_one_event * add more comments * fix ut * improve coverage * fix ut, improve coverage --- .../details/computation_op_handle.cc | 8 +- .../details/fused_all_reduce_op_handle.cc | 85 +++++++++++++++++++ .../details/fused_all_reduce_op_handle.h | 7 ++ paddle/fluid/platform/flags.cc | 17 ++++ .../unittests/test_dist_mnist_fleetapi.py | 6 +- 5 files changed, 120 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2256b826ed501..60b8461668f6f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -16,6 
+16,8 @@ #include +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -31,11 +33,13 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { - WaitInputVarGenerated(place_); + if (!FLAGS_allreduce_record_one_event) { + WaitInputVarGenerated(place_); + } auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); }; - if (is_lock_and_record_event_free_) { + if (is_lock_and_record_event_free_ || FLAGS_allreduce_record_one_event) { run_func(); } else { this->RunAndRecordEvent(run_func); diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 8f45c364476a7..94507140a81d6 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -48,11 +50,80 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( num_of_all_reduce_(num_of_all_reduce) {} #endif +FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto destroy_event = [](gpuEvent_t event) { + if (event == nullptr) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); +#endif + }; + destroy_event(start_event_); + destroy_event(end_event_); +#endif +} + void FusedAllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); VLOG(4) << this->DebugString(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event && start_event_ == nullptr) { + VLOG(10) << "FLAGS_allreduce_record_one_event=true"; + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "The hierarchical allreduce does not support " + "FLAGS_allreduce_record_one_event=true")); + PADDLE_ENFORCE_EQ(places_.size(), 1, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using one GPU device per process.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(places_[0]), true, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using GPU device.")); + auto create_event = [](gpuEvent_t *event) { + if (*event) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipEventCreateWithFlags(event, hipEventDisableTiming)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(event, cudaEventDisableTiming)); +#endif + }; + create_event(&start_event_); + create_event(&end_event_); + } + + gpuStream_t nccl_stream{nullptr}; + gpuStream_t compute_stream{nullptr}; + + if (FLAGS_allreduce_record_one_event) { + auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]); + compute_stream = + platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream(); + auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); + auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); + nccl_stream = nccl_ctx.stream(); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#else + 
PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(nccl_stream, start_event_, 0)); +#endif + } else { + WaitInputVarGenerated(); + } +#else WaitInputVarGenerated(); +#endif + // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... auto in_var_handles = DynamicCast(this->Inputs()); @@ -94,6 +165,20 @@ void FusedAllReduceOpHandle::RunImpl() { } else { FusedAllReduceFunc(in_var_handles, out_var_handles); } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamWaitEvent(compute_stream, end_event_, 0)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(compute_stream, end_event_, 0)); +#endif + } +#endif } void FusedAllReduceOpHandle::FusedAllReduceFunc( diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index d22dc0a421ac0..8473700867ce3 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -67,12 +67,19 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { #endif std::string Name() const override; + ~FusedAllReduceOpHandle(); + protected: void RunImpl() override; private: size_t num_of_all_reduce_; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + gpuEvent_t start_event_{nullptr}; + gpuEvent_t end_event_{nullptr}; +#endif + // Check the dtype of the input void GetDTypeAndNumel( const std::vector> &g_tensor, diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 18636f6f84278..dd65d743fad31 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -682,6 +682,23 @@ PADDLE_DEFINE_EXPORTED_bool( "It controls whether to apply IR pass to program when using Fleet APIs"); /** + * Distributed related FLAG + * Name: FLAGS_allreduce_record_one_event + * Since Version: 2.2.0 + * Value Range: bool, default=false + * Example: FLAGS_allreduce_record_one_event=true makes the allreduce + * operations would only wait one event instead of multiple events. + * Note: Make the allreduce operations would only wait one event instead of + * multiple events. Currently, only fuse allreduce supports this. + * Otherwise, the precision may be wrong. + */ +PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, false, + "It controls whether the allreduce operations " + "would only wait one event instead of multiple " + "events. Currently, only fuse allreduce supports " + "this. 
Otherwise, the precision may be wrong."); + +/* * CINN related FLAG * Name: FLAGS_use_cinn * Since Version: 2.3 diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py index 34abc5b45531a..3b15b06b5efa8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py @@ -32,7 +32,11 @@ def _setup_config(self): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + need_envs={'FLAGS_allreduce_record_one_event': '1'}) class FleetCollectiveTest(unittest.TestCase): From 339cb1917eb8efd8d190d3490b1aadf1f2d1a615 Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Mon, 11 Oct 2021 14:11:41 +0200 Subject: [PATCH 62/80] fix for matmul_v2 6D x 2D (#36342) --- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 8 +++---- .../mkldnn/test_matmul_v2_mkldnn_op.py | 21 ++++++++++++++++++- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 57a3c38559316..c332b9194164e 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -148,8 +148,8 @@ class MatMulV2MKLDNNKernel if (x_dims.size() == 1) { x_bd_dims[x_bd_dims.size() - 1] = x_dims[0]; } else if (x_dims.size() == 2) { - x_bd_dims[2] = x_dims[1]; - x_bd_dims[1] = x_dims[0]; + x_bd_dims[x_bd_dims.size() - 1] = x_dims[1]; + x_bd_dims[x_bd_dims.size() - 2] = x_dims[0]; } else { for (size_t i = 0; i < x_dims.size(); ++i) { x_bd_dims[i] = x_dims[i]; @@ -158,8 +158,8 @@ class MatMulV2MKLDNNKernel if (y_dims.size() == 1) { y_bd_dims[x_bd_dims.size() - 2] = y_dims[0]; } else if (y_dims.size() == 2) { - y_bd_dims[2] = y_dims[1]; - y_bd_dims[1] = y_dims[0]; + y_bd_dims[y_bd_dims.size() - 1] = y_dims[1]; + y_bd_dims[y_bd_dims.size() - 2] = y_dims[0]; } else { for (size_t i = 0; i < y_dims.size(); ++i) { y_bd_dims[i] = y_dims[i]; diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 5cc6651bb0ec8..994d78126bda5 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -235,6 +235,22 @@ def config(self): self.trans_y = True +class TestMatMulV2MatrixXMatrix6Dx2DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 2, 1, 8, 9) + self.y_shape = (9, 12) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrix2Dx5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (20, 5) + self.y_shape = (1, 2, 1, 5, 11) + self.trans_x = False + self.trans_y = False + + # BF16 TESTS def create_bf16_test_class(parent): @OpTestTool.skip_if_not_cpu_bf16() @@ -274,7 +290,8 @@ def calculate_grads(self): 2: [1, 0], 3: [0, 2, 1], 4: [0, 1, 3, 2], - 5: [0, 1, 2, 4, 3] + 5: [0, 1, 2, 4, 3], + 6: [0, 1, 2, 3, 5, 4] } # expand vector so it will be a valid matrix for multiplication @@ -370,6 +387,8 @@ def calculate_grads(self): create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp) 
create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp) create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix6Dx2DOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix2Dx5DOneDNNOp) if __name__ == "__main__": paddle.enable_static() From e5b4dd7386486610a183460e88e21b8899bd1d55 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Mon, 11 Oct 2021 20:47:08 +0800 Subject: [PATCH 63/80] [heterps] add fuse_allreduce (#35131) * heterps:add fuse_allreduce op; test=develop * add program_mode in minimize for pslib mode;test=develop --- python/paddle/distributed/fleet/utils/fs.py | 13 +- .../fleet/parameter_server/pslib/__init__.py | 13 +- python/paddle/fluid/transpiler/collective.py | 267 +++++++++++++++++- 3 files changed, 284 insertions(+), 9 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index d3f84d50ac8f9..f56580f8ca2fe 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -468,10 +468,17 @@ def __init__( self._bd_err_re = re.compile( r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:') - def _run_cmd(self, cmd, redirect_stderr=False): + def _run_cmd(self, cmd, redirect_stderr=False, retry_times=5): exe_cmd = "{} -{}".format(self._base_cmd, cmd) - ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr) - ret = int(ret) + ret = 0 + output = None + retry_sleep_second = 3 + for x in range(retry_times + 1): + ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr) + ret = int(ret) + if ret == 0: + break + time.sleep(retry_sleep_second) if ret == 134: raise FSShellCmdAborted(cmd) return ret, output.splitlines() diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index d245ce222ca6c..78af7fd65dccb 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -1091,7 +1091,8 @@ def minimize(self, scopes=None, startup_programs=None, parameter_list=None, - no_grad_set=None): + no_grad_set=None, + program_mode="all_reduce"): """ minimize a program through loss, loss can be a list in DistributedOptimizer. Note that in parameter server mode, a worker will not get anything about optimize_os @@ -1105,6 +1106,7 @@ def minimize(self, in `parameter_list`. parameter_list (list): list of Variables to update. no_grad_set (set|None): set of Variables should be ignored. + program_mode (str|"all_reduce"): grad action for grogram when use_ps_gpu. Returns: tuple: (optimize_ops, params_grads) which are, list of operators appended; and list of (param, grad) Variables pair for optimization. 
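The new program_mode argument documented above only takes effect on the use_ps_gpu path handled in the next hunk. A hypothetical caller-side sketch of selecting the fused all-reduce transpilation, with placeholder names (cost, adam) and the role-maker initialization, network construction and data feeding omitted:

# Hypothetical usage sketch, not part of this patch; "cost" is a placeholder
# loss variable, and fleet.init(role_maker) plus network setup are omitted.
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

adam = fluid.optimizer.Adam(learning_rate=0.01)
adam = fleet.distributed_optimizer(adam)
# program_mode must be one of "all_reduce" (default), "fuse_all_reduce"
# or "all_gather"; any other value raises ValueError.
adam.minimize(cost, program_mode="fuse_all_reduce")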
@@ -1139,12 +1141,17 @@ def minimize(self, if opt_info["use_ps_gpu"]: from paddle.fluid.transpiler.collective import MultiThread # check start program - + if program_mode not in [ + "all_reduce", "fuse_all_reduce", "all_gather" + ]: + raise ValueError("You should set program_mode in [ all_reduce, \ + fuse_all_reduce, all_gather ]") env = self.get_dist_env() if not isinstance(losses, list): startup_programs = [startup_programs] for i in range(0, len(startup_programs)): - t = MultiThread() + + t = MultiThread(trans_mode=program_mode) start_program = startup_programs[i] main_program = programs[i] t.transpile( diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index ec8602ec7e672..ea88a89e68224 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -65,7 +65,7 @@ def transpile(self, startup_program, main_program, rank, endpoints, self.main_program = default_main_program() self.nranks = len(endpoints) - if self.nranks == 1 and self.mode != "single_process_multi_thread": + if self.nranks == 1 and self.mode != "single_process_multi_thread" and self.mode != "box": raise ValueError('the number of endpoints must > 1') if rank < 0: @@ -441,9 +441,14 @@ class MultiThread(GradAllReduce): ''' ''' - def __init__(self, nrings=1): + def __init__(self, nrings=1, trans_mode="all_reduce"): GradAllReduce.__init__(self, nrings) - self.mode = "single_process_multi_thread" + self.mode = "box" + self.trans_mode = trans_mode + self.fuse_grad_size_in_num = 128 + gpu_nums = os.getenv("FLAGS_selected_gpus", + "0,1,2,3,4,5,6,7,8").split(",") + self.gpu_num = len(gpu_nums) def _transpile_startup_program(self): if len(self.endpoints) > 1: @@ -460,3 +465,259 @@ def _transpile_startup_program(self): print("begin to _transpile_startup_program for single-node") block = self.startup_program.global_block() block.append_op(type='c_comm_init_all', attrs={'ring_id': 0}) + + def _transpile_main_program(self): + self._insert_scale_loss_grad_ops() + if self.trans_mode == "all_gather": + print("begin to transpile in all-gather mode") + self.allgather_ranks = self.nranks * self.gpu_num + self._insert_allgather_ops() + self._update_adam_ops() + elif self.trans_mode == "fuse_all_reduce": + print("begin to transpile in fuse all-reduce mode") + self._insert_fuse_allreduce_ops() + else: + print("begin to transpile in all-reduce mode") + self._insert_allreduce_ops() + + def _insert_allgather_ops(self): + """ + insert allgather op to the main_program + """ + block = self.main_program.global_block() + ring_id = -1 + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + + offset = idx + for i in range(0, len(op_role_var), 2): + param = block.vars[op_role_var[i]] + new_grad_var = block.create_var( + name=op_role_var[i] + "_allgather", + shape=[self.allgather_ranks] + list(param.shape), + persistable=False, + dtype=core.VarDesc.VarType.FP32, + stop_gradient=True) + grad = block.vars[op_role_var[i + 1]] + if param.is_distributed: # no need to care: used in PLSC + continue + + if offset == idx: + offset += 1 + block._insert_op( + offset, + type='c_sync_calc_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={self.op_role_key: OpRole.Backward}) + offset += 1 + + # As we search ops reversedly, we should insert 
c_allgather + # op in the same way to keep the ring_id alternate + ring_id = (ring_id + 1) % self.nrings + block._insert_op( + offset, + type='c_allgather', + inputs={'X': grad}, + outputs={'Out': new_grad_var}, + attrs={ + 'nranks': self.allgather_ranks, + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + + if grad is None: + return + + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for ring_id in range(self.nrings): + block._insert_op( + idx + ring_id, + type='c_sync_comm_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + break + + def _update_adam_ops(self): + """ + remove the original adam op, and add new adam ops + """ + block = self.main_program.global_block() + + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_optimizer_op(op): + offset = idx + if op.type != 'adam' and op.type != 'lamb': # filter out scale op + continue + param_name = op.input("Param")[0] + inputs = { + "Param": block.vars[op.input("Param")[0]], + "LearningRate": block.vars[op.input("LearningRate")[0]], + "Moment1": block.vars[op.input("Moment1")[0]], + "Moment2": block.vars[op.input("Moment2")[0]], + "Beta1Pow": block.vars[op.input("Beta1Pow")[0]], + "Beta2Pow": block.vars[op.input("Beta2Pow")[0]] + } + outputs = { + "ParamOut": block.vars[op.output("ParamOut")[0]], + "Moment1Out": block.vars[op.output("Moment1Out")[0]], + "Moment2Out": block.vars[op.output("Moment2Out")[0]], + "Beta1PowOut": block.vars[op.output("Beta1PowOut")[0]], + "Beta2PowOut": block.vars[op.output("Beta2PowOut")[0]] + } + attrs = { + "epsilon": op.attr('epsilon'), + "beta1": op.attr('beta1'), + "beta2": op.attr('beta2'), + "lazy_mode": op.attr('lazy_mode'), + "min_row_size_to_use_multithread": + op.attr('min_row_size_to_use_multithread') + } + split_vars = [ + block.create_var( + name=param_name + "_" + str(i), + shape=block.vars[op.input("Param")[0]].shape, + persistable=False, + dtype=core.VarDesc.VarType.FP32, + stop_gradient=True) for i in range(self.allgather_ranks) + ] + block._insert_op( + offset, + type="split", + inputs={ + 'X': block.vars[op.input("Param")[0] + "_allgather"] + }, + outputs={'Out': split_vars}, + attrs={'num': self.allgather_ranks, + 'axis': 0}) + offset += 1 + + for i in range(self.allgather_ranks): + inputs["Grad"] = split_vars[i] + block._insert_op( + offset, + type=op.type, + inputs=inputs, + outputs=outputs, + attrs=attrs) + offset += 1 + # remove the original adam op + block._remove_op(offset) + + def _insert_fuse_allreduce_ops(self): + """ + insert coalesce_tensor and all reduce ops + """ + block = self.main_program.global_block() + ring_id = 0 % self.nrings + grad = None + param_grads = [] + # find all grad params + for op in reversed(block.ops): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0, "vars need to be one param var followed by one grad var, " \ + "but got odd number of vars" + for i in range(0, len(op_role_var), 2): + param_name = op_role_var[i] + param = block.var(param_name) + grad_name = op_role_var[i + 1] + grad = block.var(grad_name) + if param.is_distributed: + continue + param_grads.append(grad) + if grad is None: + return + + segments = [] + last_dtype = None + # split the grad based on dtype and fused size + for var in param_grads: + if len(segments) == 0 \ + or len(segments[-1]) == 
self.fuse_grad_size_in_num \ + or var.dtype != last_dtype: + segments.append([var]) + last_dtype = var.dtype + else: + segments[-1].append(var) + + fused_vars = [] + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for segment in segments: + # insert coalesce tensor + tmp_var = block.create_var( + name=unique_name.generate('FusedOutput_{}'.format( + segment[0].name)), + dtype=segment[0].dtype, + persistable=False, + stop_gradient=True) + fused_vars.append(tmp_var) + block._insert_op( + idx, + type="coalesce_tensor", + inputs={"Input": segment}, + outputs={"Output": segment, + "FusedOutput": tmp_var}, + attrs={ + "copy_data": True, + "use_align": True, + "dtype": segment[0].dtype, + self.op_role_key: OpRole.Backward + }) + break + + # insert the allreduce_sum op + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for fused_var in fused_vars: + block._insert_op( + idx, + type='c_allreduce_sum', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': False, + self.op_role_key: OpRole.Backward + }) + block._insert_op( + idx, + type='c_sync_calc_stream', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={self.op_role_key: OpRole.Backward}) + break + + if len(fused_vars) == 0: + block._sync_with_cpp() + return + + # insert the sync comm op + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + block._insert_op( + idx, + type='c_sync_comm_stream', + inputs={'X': fused_vars[0]}, + outputs={'Out': fused_vars[0]}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + break + block._sync_with_cpp() From 6d353aa524770279a9b216e011d6623b7be0ea35 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 11 Oct 2021 20:59:49 +0800 Subject: [PATCH 64/80] refine auto_growth allocator (#35732) * do not use alignedAllocator when cuda has alignment * update test * fix error during multiple process --- .../memory/allocation/aligned_allocator.cc | 1 + .../memory/allocation/allocator_facade.cc | 36 ++++++++++++++++++- .../auto_growth_best_fit_allocator.cc | 15 ++++---- .../auto_growth_best_fit_allocator_test.cc | 14 +++++--- 4 files changed, 55 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 1d89918bfebf6..f0b7f1a4b0d9e 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -20,6 +20,7 @@ namespace paddle { namespace memory { namespace allocation { +// For memory address alignment class AlignedAllocation : public Allocation { public: AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 0388e2d13afb0..281902f3a2b12 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -23,6 +23,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -201,6 +202,8 @@ class AllocatorFacadePrivate { inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { + VLOG(4) << "GetAllocator" + << " " << place << " " << size; const 
auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ : GetAllocatorMap()) @@ -256,8 +259,39 @@ class AllocatorFacadePrivate { void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) { + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + underlying_allocator, alignment, 0, allow_free_idle_chunk); } #endif diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index f36d589f907fb..9f34f5198a179 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -40,14 +40,14 @@ namespace allocation { AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, size_t chunk_size, bool allow_free_idle_chunk) - : underlying_allocator_( - std::make_shared(underlying_allocator, alignment)), + : underlying_allocator_(underlying_allocator), alignment_(alignment), chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { - size = AlignedSize(size, alignment_); +Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { + size_t size = AlignedSize(unaligned_size, alignment_); + VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; std::lock_guard guard(spinlock_); auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); @@ -57,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { free_blocks_.erase(iter); auto *chunk = block_it->chunk_; size_t remaining_size = block_it->size_ - size; + VLOG(10) << "Allocate " << size << " bytes from chunk size " + << block_it->size_ << ", remaining " << remaining_size; if (remaining_size == 0) { block_it->is_free_ = false; } else { @@ -95,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { } 
blocks.emplace_back(p + remaining_size, size, false, chunk); block_it = --(blocks.end()); - VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining " - << remaining_size; + VLOG(2) << "Not found and reallocate " << realloc_size << "(" + << static_cast(p) << "), and remaining " << remaining_size; } return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { + VLOG(10) << "Free " << allocation->size() << " bytes"; std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 6f2591c8b15c8..926af8292d2e8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" - #include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); @@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk, FLAGS_free_idle_chunk = free_idle_chunk; FLAGS_free_when_no_cache_hit = free_when_no_cache_hit; auto recorded_allocator = std::make_shared(); + size_t alignment = 4096; size_t memory_size = 8192; + auto underlying_allocator = + std::make_shared(recorded_allocator, alignment); auto ag_allocator = std::make_shared( - recorded_allocator, alignment); + underlying_allocator, alignment); for (size_t i = 0; i < 10; ++i) { auto allocation = ag_allocator->Allocate(memory_size); @@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) { auto underlying_allocator = std::make_shared(memory_capacity); + auto aligned_allocator = + std::make_shared(underlying_allocator, alignment); auto ag_allocator = std::make_shared( - underlying_allocator, alignment); + aligned_allocator, alignment); ag_allocator->Allocate(allocate_size[0]); ASSERT_EQ(underlying_allocator->AllocatedSize(), From 2a75b44727173dd4317adb61648f27bfbedbeecc Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 12 Oct 2021 10:03:57 +0800 Subject: [PATCH 65/80] Fix stop_gradient in RunProgramOp (#36339) * Fix stop_gradient in RunProgramOp * fix reference --- paddle/fluid/operators/run_program_op.h | 26 +++++++--- .../tests/unittests/test_run_program_op.py | 48 +++++++++++++++++++ 2 files changed, 67 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index ac352876e7871..04e4dc62b039b 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -142,10 +142,15 @@ static void ShareVarsIntoScope(const std::vector &vars, static void ShareVarsFromScope(const std::vector &vars, const std::vector &var_names, + const BlockDesc &global_block, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't findthem in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. 
if (var_names[i] == framework::kEmptyVarName || - var_names[i] == "Fake_var") { + var_names[i] == "Fake_var" || !global_block.HasVar(var_names[i])) { VLOG(2) << "find variable name is " << var_names[i] << ", skip it!"; continue; } @@ -214,8 +219,10 @@ class RunProgramOpKernel : public framework::OpKernel { details::ShareVarsIntoScope(input_vars, input_var_names, &scope); details::ShareVarsIntoScope(param_vars, param_names, &scope); + auto *global_block = ctx.Attr("global_block"); + if (end_op_index > start_op_index) { - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad=*/false, program_id, &scope); @@ -240,8 +247,10 @@ class RunProgramOpKernel : public framework::OpKernel { parallel_executor->RunWithoutFetch(skip_eager_delete_vars); } // Step 4. Get Output - details::ShareVarsFromScope(output_vars, output_var_names, &scope); - details::ShareVarsFromScope(dout_vars, dout_var_names, &scope); + details::ShareVarsFromScope(output_vars, output_var_names, *global_block, + &scope); + details::ShareVarsFromScope(dout_vars, dout_var_names, *global_block, + &scope); // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); @@ -307,10 +316,11 @@ class RunProgramGradOpKernel : public framework::OpKernel { "least one sub scope.")); auto &scope = *(global_inner_scope->kids().front()); + auto *global_block = ctx.Attr("global_block"); if (end_op_index > start_op_index) { // Step 2. prepare executor and scope - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad*/ true, program_id, &scope); @@ -341,8 +351,10 @@ class RunProgramGradOpKernel : public framework::OpKernel { } // Step 4. get outputs - details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, &scope); - details::ShareVarsFromScope(param_grad_vars, param_grad_names, &scope); + details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, + *global_block, &scope); + details::ShareVarsFromScope(param_grad_vars, param_grad_names, + *global_block, &scope); // Step5. 
drop current scope global_inner_scope->DeleteScope(&scope); diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index b3d0845a4fbbc..33b32a6632c9e 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -343,5 +343,53 @@ def build_model(self): return fwd_op_num +class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + self.fc1 = paddle.nn.Linear(10, 10) + self.fc2 = paddle.nn.Linear(10, 1) + + def forward(self, x): + out = self.fc1(x) + out.stop_gradient = True + out = self.fc2(out) + return out + + +class TestParametersWithStopGradient(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.iter = 5 + + def train(self, to_static): + # prepare env + paddle.seed(self.seed) + + net = Net() + if to_static: + net = paddle.jit.to_static(net) + sgd = paddle.optimizer.SGD(0.01, parameters=net.parameters()) + + for i in range(self.iter): + x = paddle.rand([4, 10]) + out = net(x) + loss = paddle.mean(out) + + loss.backward() + sgd.minimize(loss) + net.clear_gradients() + + return loss + + def test_stop_gradient(self): + paddle.disable_static() + + dy_loss = self.train(to_static=False) + st_loss = self.train(to_static=True) + self.assertEqual(dy_loss[0], st_loss[0]) + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() From 0594d2a7f086cc64b58f01aeb0299cc06c683825 Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 12 Oct 2021 10:05:52 +0800 Subject: [PATCH 66/80] Revert "refine case when thread_num = 1 (#36201)" (#36347) This reverts commit 7e60cc63c33f0c17df36b0ee52ae50a3d04a6697. --- .../fast_threaded_ssa_graph_executor.cc | 20 +++---------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index eb027d7c2f636..75998e4582e2b 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -47,16 +47,7 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( << "Change thread number to 1 because the toposort order is unique"; strategy_.num_threads_ = 1; } - if (strategy_.num_threads_ > 1) { - pool_.reset(new ::ThreadPool(strategy.num_threads_)); - } else { - auto nodes = ir::TopologySortOperations(*graph_); - traced_ops_.clear(); - traced_ops_.reserve(nodes.size()); - for (auto *node : nodes) { - traced_ops_.push_back(&node->Wrapper()); - } - } + pool_.reset(new ::ThreadPool(strategy.num_threads_)); for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); op_deps_.emplace(op, dep); @@ -239,7 +230,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; - auto func = [=] { + this->pool_->enqueue([=] { std::deque op_queue; op_queue.push_front(op); @@ -298,12 +289,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( } --remaining_; complete_q->Push(complete); - }; - if (pool_) { - pool_->enqueue(func); - } else { - func(); - } + }); } void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { From ec148cab5be5e7298203d2cd5c294b41c0622d8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?LJQ=E2=9D=A4=EF=B8=8F?= <33169170+lijiaqi0612@users.noreply.github.com> Date: Tue, 12 Oct 2021 10:29:03 +0800 
Subject: [PATCH 67/80] fft: modify sample code result (#36325) --- python/paddle/tensor/fft.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py index f7990e3f89107..20fd143589fa4 100644 --- a/python/paddle/tensor/fft.py +++ b/python/paddle/tensor/fft.py @@ -339,7 +339,7 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): xp = paddle.to_tensor(x) irfft_xp = paddle.fft.irfft(xp).numpy() print(irfft_xp) - # [0. 0. 0. 4.] + # [0. 1. 0. 0.] """ return fft_c2r(x, n, axis, norm, forward=False, name=name) @@ -477,7 +477,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): import numpy as np import paddle - x = x = np.mgrid[:4, :4, :4][1] + x = np.mgrid[:4, :4, :4][1] xp = paddle.to_tensor(x) fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() print(fftn_xp) @@ -631,9 +631,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): # use axes(2, 0) print(paddle.fft.rfftn(x, axes=(2, 0))) # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], + # [[[(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ]], # # [[0j , 0j , 0j ], # [0j , 0j , 0j ], @@ -1267,9 +1267,8 @@ def fftshift(x, axes=None, name=None): import paddle x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) res = paddle.fft.fftshift(fftfreq_xp).numpy() print(res) # [-1.3333334 -0.6666667 0. 0.6666667 1.3333334] @@ -1311,9 +1310,8 @@ def ifftshift(x, axes=None, name=None): import paddle x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) res = paddle.fft.ifftshift(fftfreq_xp).numpy() print(res) # [ 1.3333334 -1.3333334 -0.6666667 0. 
0.6666667] From d247cf17d11e2ee32921c0b321bafb28d7a3477d Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 12 Oct 2021 10:59:47 +0800 Subject: [PATCH 68/80] =?UTF-8?q?fix=20bugs=20in=20mp=5Flayers=E3=80=81pp?= =?UTF-8?q?=5Flayers=20and=20HybridParallelClipGrad=20(#36144)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix calling bug of HybridParallelClipGrad * fix bugs of HybridParallelClipGrad * add unittest of pp with HybridParallelClipGrad * fix bugs in mp_layers.py * update * fix bugs in pp_layers.py * update --- .../hybrid_parallel_optimizer.py | 36 ++++++++++++------- .../parallel_layers/mp_layers.py | 8 ++--- .../parallel_layers/pp_layers.py | 7 ++++ .../unittests/hybrid_parallel_pp_alexnet.py | 17 ++++----- .../unittests/hybrid_parallel_pp_clip_grad.py | 35 ++++++++++++++++++ ...test_parallel_dygraph_pipeline_parallel.py | 3 ++ 6 files changed, 81 insertions(+), 25 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 76e326ce20d7c..6cd875905864b 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -52,6 +52,7 @@ def _dygraph_clip(self, params_grads): params_and_grads = [] sum_square_list_dist = [] sum_square_list_not_dist = [] + for p, g in params_grads: if g is None: continue @@ -64,29 +65,38 @@ def _dygraph_clip(self, params_grads): square = layers.square(merge_grad) sum_square = layers.reduce_sum(square) - if p.is_distributed: - sum_square_list_dist.append(sum_square) - else: - sum_square_list_not_dist.append(sum_square) + not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or ( + hasattr(p, 'is_firstly_shared') and + getattr(p, 'is_firstly_shared', True)) - # all parameters have been filterd out - if len(sum_square_list_dist) + len(sum_square_list_not_dist) == 0: - return params_grads + if not_shared_enable: + if p.is_distributed: + sum_square_list_dist.append(sum_square) + else: + sum_square_list_not_dist.append(sum_square) global_norm_var_dist = layers.concat(sum_square_list_dist) if len( sum_square_list_dist) != 0 else layers.concat( [paddle.to_tensor([0.])]) global_norm_var_dist = layers.reduce_sum(global_norm_var_dist) + global_norm_var_not_dist = layers.concat( sum_square_list_not_dist) if len( sum_square_list_not_dist) != 0 else layers.concat( [paddle.to_tensor([0.])]) global_norm_var_not_dist = layers.reduce_sum(global_norm_var_not_dist) - # add all reduce to get global norm of distributed params_and_grads in world size - # all reduce is not needed while getting global norm of non-distributed params_and_grads - paddle.distributed.all_reduce( - global_norm_var_dist, group=self._hcg.get_check_parallel_group()) + # add all reduce to get global norm of distributed params_and_grads + if self._hcg.get_model_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_dist, + group=self._hcg.get_check_parallel_group()) + + # add all reduce to get global norm of non-distributed params_and_grads in groups of pp + if self._hcg.get_pipe_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_not_dist, + 
group=self._hcg.get_pipe_parallel_group()) # In Sharding mode, param and grad is mapping different rank in optimizer. # ClipGradByGlobalNorm need allreduce to get globol norm @@ -143,8 +153,8 @@ def __init__(self, optimizer, hcg, strategy): if isinstance(self._inner_opt._grad_clip, ClipGradByGlobalNorm) and not self._use_dp_mode: - logger.warning("using ClipGradByGlobalNorm in TensorParallel, the origin " \ - "optmizer'grad clip will be changed.") + logger.warning("While using ClipGradByGlobalNorm in TensorParallel, PipelineParallel " \ + "or Sharding, the grad clip of original optimizer will be changed.") if self._sharding_enable: # change sharding inner_optimizer's _grad_clip diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index 2555d73462b78..2ce8cf7bdeb74 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -70,7 +70,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False def forward(self, x): if self.is_mp: @@ -135,7 +135,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False if has_bias: # initialize bias to zero like Megatron @@ -144,7 +144,7 @@ def __init__(self, attr=paddle.nn.initializer.Constant(value=0.0), dtype=self._dtype, is_bias=True) - self.bias.is_distributed = True + self.bias.is_distributed = True if self.is_mp else False else: self.bias = None @@ -212,7 +212,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False if has_bias: self.bias = self.create_parameter( diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index db6fc964895ff..9920bbd400c70 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -261,6 +261,10 @@ def _synchronize_shared_weights(self): src=min(comm['ranks']), group=comm['group']) + for param in comm['layer'].parameters(): + if self.global_rank != min(comm['ranks']): + setattr(param, 'is_firstly_shared', False) + def allreduce_shared_weight_gradients(self): for key, comm in self.shared_comm.items(): param = getattr(self.shared_layers[key], comm['weight_attr']) @@ -316,6 +320,9 @@ def _build_layer(self): self.shared_layers[layer.layer_name] = layer.build_layer() self.shared_weight_attrs[ layer.layer_name] = layer.shared_weight_attr + for param in self.shared_layers[ + layer.layer_name].parameters(): + setattr(param, "is_firstly_shared", True) if layer.forward_func is None: self.run_function.append(self.shared_layers[ diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py index 912849ffbeb71..71e873b0e2f7c 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py @@ -53,6 +53,13 @@ def setUp(self): } fleet.init(is_collective=True, strategy=strategy) + def build_optimizer(self, model): + scheduler = 
paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + parameters=model.parameters()) + return scheduler, optimizer + def test_pp_model(self): hcg = fleet.get_hybrid_communicate_group() word_size = hcg.get_model_parallel_world_size() @@ -63,10 +70,7 @@ def test_pp_model(self): #construct model a model_a = AlexNet(10) - scheduler_a = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2], values=[0.001, 0.002], verbose=True) - optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, - parameters=model_a.parameters()) + scheduler_a, optimizer_a = self.build_optimizer(model_a) param_len = len(model_a.parameters()) @@ -76,10 +80,7 @@ def test_pp_model(self): # construct model b model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) - scheduler_b = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2], values=[0.001, 0.002], verbose=True) - optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, - parameters=model_b.parameters()) + scheduler_b, optimizer_b = self.build_optimizer(model_b) model_b = fleet.distributed_model(model_b) optimizer_b = fleet.distributed_optimizer(optimizer_b) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py new file mode 100644 index 0000000000000..de980f3c3f787 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
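For reference, a conceptual sketch (hypothetical helper, not code from this patch) of the scale that TestPPClipGrad's ClipGradByGlobalNorm(0.5) ends up applying once _dygraph_clip above has all-reduced the two partial squared-norm sums over the model-parallel and pipeline groups:

# Conceptual sketch only: combine the distributed and non-distributed
# squared-norm sums into the single clipping scale used for every gradient.
import paddle

def combine_global_norm(sum_sq_dist, sum_sq_not_dist, clip_norm):
    # Global gradient norm over all ranks after the mp/pp all-reduces.
    global_norm = paddle.sqrt(sum_sq_dist + sum_sq_not_dist)
    max_norm = paddle.full([1], clip_norm, dtype=global_norm.dtype)
    # Gradients are scaled by clip_norm / max(global_norm, clip_norm),
    # i.e. left untouched when the global norm is already below clip_norm.
    return max_norm / paddle.maximum(global_norm, max_norm)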
+ +from __future__ import division +from __future__ import print_function + +import paddle +import unittest +from hybrid_parallel_pp_alexnet import TestDistPPTraning + + +class TestPPClipGrad(TestDistPPTraning): + def build_optimizer(self, model): + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + grad_clip=grad_clip, + parameters=model.parameters()) + return scheduler, optimizer + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 7a4f7f9fbd62b..f54aa1bb6e556 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -42,6 +42,9 @@ def test_hybrid_parallel_save_load(self): def test_hybrid_parallel_recompute(self): self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py') + def test_hybrid_parallel_pp_clip_grad(self): + self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py') + if __name__ == "__main__": unittest.main() From e275e423043e9df51f0e969ffc81e0dc1562aa01 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Mon, 11 Oct 2021 22:13:17 -0500 Subject: [PATCH 69/80] Add pool2d test convert (#36338) --- .../inference/tensorrt/convert/pool2d_op.cc | 27 ++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 41 ++++++++++--------- .../ir/inference/test_trt_convert_pool2d.py | 30 +++++++++++--- 3 files changed, 73 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1898f28c73ad0..733a8f64ae5db 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -107,6 +107,9 @@ class Pool2dOpConverter : public OpConverter { plugin_pool_type = plugin::PoolPlugin::PoolType::avg; } + if (padding_algorithm == "VALID") { + std::fill(paddings.begin(), paddings.end(), 0); + } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); nvinfer1::DimsHW nv_strides(strides[0], strides[1]); nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); @@ -123,6 +126,30 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + layer = pool_layer; + } else if (!adaptive && !global_pooling && ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + // If ceil mode is true, we will pad the appropriate size to the input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, + input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER( + engine_, Padding, *const_cast(input1), pre_pad, + post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. 
The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 7a70ceda60c1f..ef50aee48e2eb 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -174,22 +174,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "pool2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (paddings.size() > 2) return false; - if (desc.HasAttr("exclusive")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { - std::vector ksize = - BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); - for (size_t i = 0; i < ksize.size(); i++) { - if (ksize[i] <= paddings[i]) { - VLOG(3) << "the padding size should be less than the filter size " - "for exclusive-counting pooling."; - return false; - } - } - } - } - if (desc.HasAttr("ceil_mode")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("ceil_mode"))) return false; + if (paddings.size() > 2) { + return false; } if (desc.Input("X").size() != 1) { VLOG(3) << "TRT Pool2d expect 1 input, but got " @@ -211,15 +197,32 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << pool_type << " pool type."; return false; } + if (pool_type == "avg") { + if (desc.HasAttr("global_pooling")) { + if (!BOOST_GET_CONST(bool, desc.GetAttr("global_pooling"))) { + if (desc.HasAttr("exclusive")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { + std::vector ksize = + BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); + for (size_t i = 0; i < ksize.size(); i++) { + if (ksize[i] <= paddings[i]) { + VLOG(3) << "the padding size should be less than the " + "filter size " + "for exclusive-counting pooling."; + return false; + } + } + } + } + } + } + } } } if (op_type == "conv2d" || op_type == "conv2d_transpose" || op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || op_type == "depthwise_conv2d_transpose") { - std::vector paddings = - BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (desc.Input("Input").size() != 1) { VLOG(3) << "TRT Conv2d expect 1 input, but got " << desc.Input("Input").size() << " input."; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index 3e923b1bd89d6..9ec2f83fa5ba0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -21,9 +21,22 @@ class TrtConvertPool2dTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: + def is_paddings_valid(self, program_config: ProgramConfig) -> bool: + exclusive = program_config.ops[0].attrs['exclusive'] + paddings = program_config.ops[0].attrs['paddings'] + ksize = program_config.ops[0].attrs['ksize'] + pooling_type = program_config.ops[0].attrs['pooling_type'] + global_pooling = program_config.ops[0].attrs['global_pooling'] + if global_pooling == False: + if pooling_type == 'avg': + for index in range(len(ksize)): + if ksize[index] <= paddings[index]: + return False return True + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return self.is_paddings_valid(program_config) + def 
sample_program_configs(self): self.trt_param.workspace_size = 1073741824 @@ -34,7 +47,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) for strides in [[1, 1], [2, 2], [1, 2]]: - for paddings in [[0, 2], [0, 3], [1, 2, 3, 4]]: + for paddings in [[0, 2], [0, 3], [0, 1, 2, 3]]: for pooling_type in ['max', 'avg']: for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']: for ksize in [[2, 3], [3, 3]]: @@ -43,7 +56,6 @@ def generate_weight1(attrs: List[Dict[str, Any]]): for exclusive in [True, False]: for adaptive in [True, False]: for ceil_mode in [True, False]: - self.paddings = paddings dics = [{ "pooling_type": @@ -102,9 +114,6 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if self.paddings == [0, 3] or attrs[0][ - 'global_pooling'] == True or attrs[0]['ceil_mode'] == True: - return 0, 3 return 1, 2 attrs = [ @@ -139,6 +148,15 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED, "4-dims paddings are not support for trt now.") + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs['global_pooling'] == True: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "It is not support that global_pooling is true for trt now.") + def test(self): self.add_skip_trt_case() self.run_test() From 8cc7146d1c53000888b4f6f063aed7db8c9ff922 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 12 Oct 2021 11:16:31 +0800 Subject: [PATCH 70/80] [NPU] add int64 kernel for slice, test=develop (#36328) * [NPU] add int64 kernel for scale and slice, test=develop * remove int64 for scale, test=develop --- paddle/fluid/operators/scale_op_npu.cc | 5 +- paddle/fluid/operators/slice_op_npu.cc | 39 +++++------ .../tests/unittests/npu/test_slice_op_npu.py | 64 +++++++++++++++++++ 3 files changed, 80 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index 2381719020869..744a9b137f622 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -12,11 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include - -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/scale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index f8bf46da4a638..52351a98bce37 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -12,18 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. 
*/ -#include -#include - -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, const std::vector starts, const std::vector ends, @@ -54,7 +50,7 @@ void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, } } -template +template class SliceNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -128,17 +124,14 @@ class SliceNPUKernel : public framework::OpKernel { UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); + auto stream = ctx.template device_context().stream(); const auto& runner = NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, {"size", size}}); - - auto stream = - ctx.template device_context() - .stream(); runner.Run(stream); } }; -template +template class SliceGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -221,15 +214,13 @@ class SliceGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - slice, ops::SliceNPUKernel, - ops::SliceNPUKernel, - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); +REGISTER_OP_NPU_KERNEL(slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SliceNPUKernel, +#endif + ops::SliceNPUKernel); + +REGISTER_OP_NPU_KERNEL(slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 055c3015f82f5..611691109e187 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -527,5 +527,69 @@ def init_dtype(self): self.dtype = np.float16 +class TestSliceOpInt64(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.randint( + 100, size=(3, 4, 5, 6)).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def init_dtype(self): + self.dtype = np.int64 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSliceOpTensorInt64(TestSliceOpInt64): + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.inputs = { + 'Input': self.input, + 'StartsTensor': self.starts, + 'EndsTensor': self.ends + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': [-1, -1, -1], + 'ends': [-1, -1, -1], + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.randint( + 100, 
size=(3, 4, 5, 6)).astype(self.dtype) + self.starts = np.array([1, 0, 2]).astype('int32') + self.ends = np.array([3, 3, 4]).astype('int32') + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + if __name__ == '__main__': unittest.main() From 1d660eb6767b990f8a5760e7b766a880f88d2d03 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 17:42:25 +0800 Subject: [PATCH 71/80] Fix the bug when axis is specified and weight is provided --- .../unittests/test_cross_entropy_loss.py | 48 +++++++++++++++++++ python/paddle/nn/functional/loss.py | 46 +++++++++++------- 2 files changed, 78 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index d2eae1cce5bcb..6a0d955040f35 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1175,6 +1175,54 @@ def test_cross_entropy_loss_2d_with_weight_none(self): self.assertTrue(np.allclose(static_ret, expected)) self.assertTrue(np.allclose(dy_ret_value, expected)) + def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self): + input_np = np.random.random(size=(2, 3, 2, 2)).astype(self.dtype) #NCHW + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + weight_np = np.random.random(size=(3, )).astype(self.dtype) #C + + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[2, 3, 2, 2], dtype=self.dtype) + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') + weight = fluid.data(name='weight', shape=[3], dtype=self.dtype) + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction='mean', axis=1) + # specify the class channels to axis 1 + ret = cross_entropy_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=fluid.dygraph.to_variable(weight_np), reduction='mean') + dy_ret = cross_entropy_loss( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d( + np.transpose(input_np, [0, 2, 3, 1]), + label_np, + weight=weight_np, + reduction='mean')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + def test_cross_entropy_loss_2d_with_weight_mean_ignore_exceedlabel(self): N = 4 C = 3 diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index da2d010c323b5..f4e8711a231e4 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1700,19 +1700,26 @@ def cross_entropy(input, out = _C_ops.elementwise_mul(out, weight_gather_reshape) else: - if input.shape[-1] != weight.shape[-1]: + if input.shape[axis] != weight.shape[-1]: raise ValueError( - "input's class_dimension({}) must equal to \ - weight's class_dimension({}) \ - when weight is provided" - 
.format(input.shape[-1], weight.shape[-1])) + "input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided"\ + .format(input.shape[axis], weight.shape[-1])) ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - -1] == 1: - ignore_weight_mask.squeeze_(-1) - weight_gather = _C_ops.gather_nd(weight, valid_label) + axis] == 1: + ignore_weight_mask.squeeze_(axis) + if axis != -1: + temp_perm = list(range(axis % valid_label.ndim)) \ + + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + [axis%valid_label.ndim] + weight_gather = _C_ops.gather_nd( + weight, valid_label.transpose(temp_perm)) + else: + weight_gather = _C_ops.gather_nd(weight, valid_label) weight_gather = _C_ops.elementwise_mul(weight_gather, ignore_weight_mask) input_shape = list(label.shape) @@ -1807,20 +1814,27 @@ def cross_entropy(input, weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) else: - if input.shape[-1] != weight.shape[-1]: - raise ValueError("input's class_dimension({}) must equal to "\ - "weight's class_dimension({}) "\ - "when weight is provided" - .format(input.shape[-1], weight.shape[-1])) + if input.shape[axis] != weight.shape[-1]: + raise ValueError("input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided"\ + .format(input.shape[axis], weight.shape[-1])) valid_label = paddle.where(label == ignore_index, paddle.zeros_like(label), label) ignore_weight_mask = paddle.cast((label != ignore_index), input.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - -1] == 1: - ignore_weight_mask = paddle.squeeze(ignore_weight_mask, -1) - weight_gather = paddle.gather_nd(weight, valid_label) + axis] == 1: + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) + if axis != -1: + temp_perm = list(range(axis % valid_label.ndim)) \ + + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + [axis % valid_label.ndim] + weight_gather = paddle.gather_nd( + weight, paddle.transpose(valid_label, temp_perm)) + else: + weight_gather = paddle.gather_nd(weight, valid_label) weight_gather = paddle.multiply(weight_gather, ignore_weight_mask) input_shape = list(label.shape) From 8c2fbc3138ff4e17c451cabe605f7f22571d6aaf Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 18:35:39 +0800 Subject: [PATCH 72/80] Update loss.py --- python/paddle/nn/functional/loss.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f4e8711a231e4..f8e03e476d7f0 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1668,12 +1668,12 @@ def cross_entropy(input, format(invalid_label[0], 0)) # TODO: Temporarily use paddle.nonzero instead of paddle.max # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[-1])) > 0: + if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label >= input.shape[-1])) + valid_label, paddle.nonzero(valid_label >= input.shape[axis])) raise ValueError( "Target({}) is out of class_dimension's upper bound({})". 
- format(invalid_label[0], input.shape[-1] - 1)) + format(invalid_label[0], input.shape[axis] - 1)) _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', From 53dc0143377552418f1c4db39c5a388a75fd52f8 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 20:36:26 +0800 Subject: [PATCH 73/80] Update loss.py --- python/paddle/nn/functional/loss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index f8e03e476d7f0..5bb317cf3e746 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1712,9 +1712,9 @@ def cross_entropy(input, if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: ignore_weight_mask.squeeze_(axis) - if axis != -1: + if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ + [axis%valid_label.ndim] weight_gather = _C_ops.gather_nd( weight, valid_label.transpose(temp_perm)) From 3675f25df2d176e558a6d6f3179e0879b6f7c9a6 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 21:10:55 +0800 Subject: [PATCH 74/80] Update loss.py --- python/paddle/nn/functional/loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 5bb317cf3e746..eb043c005663a 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1670,7 +1670,8 @@ def cross_entropy(input, # to detect and find out possible illegal label values if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label >= input.shape[axis])) + valid_label, + paddle.nonzero(valid_label >= input.shape[axis])) raise ValueError( "Target({}) is out of class_dimension's upper bound({})". 
format(invalid_label[0], input.shape[axis] - 1)) From 6cd41cec2146da2f5008a42e972a4627a4deb26d Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 22:15:05 +0800 Subject: [PATCH 75/80] Update loss.py --- python/paddle/nn/functional/loss.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index eb043c005663a..38d4da17cbefa 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1712,11 +1712,12 @@ def cross_entropy(input, out.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: - ignore_weight_mask.squeeze_(axis) + # TODO: Temporarily use squeeze instead of squeeze_ + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ - + [axis%valid_label.ndim] + + [axis % valid_label.ndim] weight_gather = _C_ops.gather_nd( weight, valid_label.transpose(temp_perm)) else: @@ -1828,9 +1829,9 @@ def cross_entropy(input, if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) - if axis != -1: + if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis + 1) % valid_label.ndim, valid_label.ndim)) \ + + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ + [axis % valid_label.ndim] weight_gather = paddle.gather_nd( weight, paddle.transpose(valid_label, temp_perm)) From a4246b90646101f8dd7734d2d8ee5ce8106b67a8 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 23:13:41 +0800 Subject: [PATCH 76/80] Update test_cross_entropy_loss.py --- python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 6a0d955040f35..c4be262e93029 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1208,7 +1208,7 @@ def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self): self.assertIsNotNone(static_ret) with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='mean') + weight=fluid.dygraph.to_variable(weight_np), reduction='mean', axis=1) dy_ret = cross_entropy_loss( fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) From 59841e6f324e3a0fe49b047bdff1e425a67497fb Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 23:44:26 +0800 Subject: [PATCH 77/80] Update test_cross_entropy_loss.py --- .../paddle/fluid/tests/unittests/test_cross_entropy_loss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index c4be262e93029..d3ed76e34a614 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1208,7 +1208,9 @@ def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self): self.assertIsNotNone(static_ret) 
with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( - weight=fluid.dygraph.to_variable(weight_np), reduction='mean', axis=1) + weight=fluid.dygraph.to_variable(weight_np), + reduction='mean', + axis=1) dy_ret = cross_entropy_loss( fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) From f77083bbbc6f559bebee42ec12d42a37472dc8c4 Mon Sep 17 00:00:00 2001 From: HydrogenSulfate <490868991@qq.com> Date: Mon, 11 Oct 2021 23:45:13 +0800 Subject: [PATCH 78/80] Update loss.py --- python/paddle/nn/functional/loss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 38d4da17cbefa..b1db45ad50669 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1713,7 +1713,8 @@ def cross_entropy(input, if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ axis] == 1: # TODO: Temporarily use squeeze instead of squeeze_ - ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, + axis) if axis != -1 and axis != valid_label.ndim - 1: temp_perm = list(range(axis % valid_label.ndim)) \ + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ From b3f6eedb77925c28a193eaedb858220b9417c5ca Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 12 Oct 2021 12:55:02 +0800 Subject: [PATCH 79/80] refine LarsOptimizer (#36351) --- python/paddle/fluid/optimizer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 24076e82b0365..4625d7ea89b25 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2047,11 +2047,15 @@ def _create_accumulators(self, block, parameters): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) _lars_weight_decay = self._lars_weight_decay + _lars_coeff = self._lars_coeff param_name = param_and_grad[0].name + is_excluded = False if len(self._exclude_from_weight_decay) > 0: for name in self._exclude_from_weight_decay: if name in param_name: _lars_weight_decay = 0.0 + _lars_coeff = 0.0 + is_excluded = True break velocity_acc = self._get_accumulator(self._velocity_acc_str, @@ -2065,7 +2069,7 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "mu": self._momentum, - "lars_coeff": self._lars_coeff, + "lars_coeff": _lars_coeff, "lars_weight_decay": _lars_weight_decay, "multi_precision": find_master, "rescale_grad": self._rescale_grad @@ -2086,7 +2090,7 @@ def _append_optimize_op(self, block, param_and_grad): # create the momentum optimize op momentum_op = block.append_op( - type=self.type, + type='momentum' if is_excluded else self.type, inputs=inputs, outputs=outputs, attrs=attrs, From 09778f464956a450491d5ade3ef79586d61403ca Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 12 Oct 2021 13:31:12 +0800 Subject: [PATCH 80/80] [NPU] fix elementwise_mul to support broadcast, test=develop (#36258) * [NPU] fix elementwise_mul to support broadcast, test=develop * remove debug files, test=develop * add axis support, test=develop --- .../elementwise/elementwise_mul_op_npu.cc | 132 ++++++--- .../npu/test_elementwise_mul_op_npu.py | 274 +++++++++++------- 2 files changed, 258 insertions(+), 148 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc 
b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 47aa7e2521f76..b2030ad21e8d1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -12,67 +12,127 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND_CL -#include -#include - #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; - -template +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const int axis, + const framework::DDim& ddims, + const framework::DDim& brd_ddims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t brd_size = brd_ddims.size(); + int64_t org_size = ddims.size(); + // int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < brd_size; ++i) { + if (i < axis || i >= org_size + axis) { + axes.push_back(i); + continue; + } + if (brd_ddims[i] > ddims[i - axis]) { + axes.push_back(i); + } + } + // LOG(INFO) << "axes = " << framework::make_ddim(axes).to_str(); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class ElementwiseMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + + bool direct_compute = false; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + if (x_dims.size() >= y_dims.size()) { + direct_compute = x_dims.size() == (y_dims.size() + axis); + } else { + direct_compute = y_dims.size() == (x_dims.size() + axis); + } - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); + auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); - runner.Run(stream); + if (direct_compute) { + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); + runner.Run(stream); + } else { + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); + const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); + runner.Run(stream); + } } }; -template +template class ElementwiseMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); - auto place = ctx.GetPlace(); + axis = (axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis); + auto stream = ctx.template device_context().stream(); - auto stream = - ctx.template device_context() - .stream(); + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); if (dx) { - dx->mutable_data(place); - const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); - runner_dx.Run(stream); + if (dx->dims() == dout->dims()) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); + runner_dx.Run(stream); + } else { + Tensor dx_temp(x->type()); + dx_temp.Resize(trans_x.dims()); + dx_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {}); + runner_dx.Run(stream); + ReduceDims(ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, + dx); + } } - if (dy) { - dy->mutable_data(place); - const auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); - runner_dy.Run(stream); + if (dy->dims() == dout->dims()) { + dy->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); + runner_dy.Run(stream); + } else { + Tensor dy_temp(y->type()); + dy_temp.Resize(trans_y.dims()); + dy_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dy = + NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {}); + runner_dy.Run(stream); + ReduceDims(ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, + dy); + } } } }; @@ -82,15 +142,9 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - elementwise_mul, - ops::ElementwiseMulNPUKernel, - ops::ElementwiseMulNPUKernel); +REGISTER_OP_NPU_KERNEL(elementwise_mul, ops::ElementwiseMulNPUKernel, + ops::ElementwiseMulNPUKernel); REGISTER_OP_NPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradNPUKernel, - ops::ElementwiseMulGradNPUKernel); -#endif + elementwise_mul_grad, ops::ElementwiseMulGradNPUKernel, + ops::ElementwiseMulGradNPUKernel); diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py index ea94661e8a51e..92bbc9f536d13 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py @@ -18,147 +18,203 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid paddle.enable_static() -SEED = 2021 -class TestElementwiseMul(OpTest): +class ElementwiseMulOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def setUp(self): self.set_npu() self.op_type = "elementwise_mul" - self.place = paddle.NPUPlace(0) - + self.dtype = np.float32 + self.axis = -1 self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - out = np.multiply(x, y) + self.init_input_output() + self.init_axis() self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.attrs = {} - self.outputs = {'Out': out} + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis} - def set_npu(self): - self.__class__.use_npu = True + def test_check_output(self): + 
self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('Y')) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) def init_dtype(self): - self.dtype = np.float32 + pass - def test_check_output(self): - self.check_output_with_place(self.place) + def init_axis(self): + pass - # TODO(ascendrc): Mul grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMulOp_scalar(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} -class TestElementwiseMulFp16(OpTest): +class TestElementwiseMulOp_Vector(ElementwiseMulOp): def setUp(self): self.set_npu() self.op_type = "elementwise_mul" - self.place = paddle.NPUPlace(0) + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} - self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - out = np.multiply(x, y) +class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x * self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1) + } + + +class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100) } - self.attrs = {} - self.outputs = {'Out': out} - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1) + } + + +class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): + def setUp(self): + 
self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 2, 11).astype(np.float32), + 'Y': np.random.rand(10, 1, 11).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 4, 2, 3).astype(np.float32), + 'Y': np.random.rand(10, 4, 1, 3).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "paddle is not compiled with NPU") +class TestElementwiseMulOpFp16(ElementwiseMulOp): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) - - -class TestElementwiseMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(32, 32)).astype('float32') - b_np = np.random.random(size=(32, 32)).astype('float32') - c_np = np.random.random(size=(32, 32)).astype('float32') - d_np = np.random.random(size=(32, 32)).astype('float32') - label_np = np.random.randint(2, size=(32, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') - b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') - c = paddle.static.data(name="c", shape=[32, 32], dtype='float32') - d = paddle.static.data(name="d", shape=[32, 32], dtype='float32') - label = paddle.static.data( - name="label", shape=[32, 1], dtype='int64') - - e = paddle.multiply(a, b) - f = paddle.multiply(c, d) - f.stop_gradient = True - g = paddle.multiply(e, f) - - fc_1 = fluid.layers.fc(input=g, size=128) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + +class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(30, 3, 1, 5).astype(np.float32), + 'Y': np.random.rand(30, 1, 4, 1).astype(np.float32) + } + self.outputs = {'Out': 
self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 10).astype(np.float32), + 'Y': np.random.rand(2, 2, 10, 10).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y'] + } if __name__ == '__main__':