diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 42de34fb52061..640e2e37ad434 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210729") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 485fddff4df42..08e912f52ccb5 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -188,8 +188,13 @@ cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_vars_inference) +IF(WITH_XPU) +cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto + shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils) +ELSE() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils) +ENDIF() cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op_registry device_context) @@ -405,7 +410,7 @@ configure_file(commit.h.in commit.h) # Adapt to custom op mechanism: Include the header files related to the data type # to avoid exposing the path of the underlying file include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include) +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include) if(WITH_ROCM) hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 7e2c41dd4f795..82ce045fad723 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -77,10 +77,6 @@ typedef std::vector> ParamsAndGrads; constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads"; constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads"; -typedef std::vector ProgramDescs; -constexpr char kProgramDescs[] = "program_descs"; -constexpr char kStartupProgramDescs[] = "startup_program_descs"; - typedef std::unordered_set PinnedVars; constexpr char kPinnedVars[] = "pinned_vars"; diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 0a856330f8e74..652ce77d84457 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -15,7 +15,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_helper.h" #include #include +#include "paddle/fluid/framework/op_proto_maker.h" +DECLARE_bool(convert_all_blocks); DEFINE_string(print_sub_graph_dir, "", "FLAGS_print_sub_graph_dir is used " "to print the nodes of sub_graphs."); @@ -431,6 +433,117 @@ std::vector TopologySortGraphByDescOrder(const Graph &graph) { return ret; } +static OpDesc *ReplaceScaleLossGradOp(const Node &node, OpDesc *desc) { + desc->SetType("fill_constant"); + desc->SetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName(), + (static_cast(OpRole::kBackward) | static_cast(OpRole::kLoss))); + desc->SetAttr("value", 1.0f); + std::vector output_names; + for (auto out : node.outputs) { + output_names.emplace_back(out->Name()); + } + desc->SetOutput("Out", output_names); + return desc; +} + +static void GetGraphOpDesc(const std::vector &nodes, + std::vector *ops) { + for (Node *n : nodes) { + // if node is not Op, skip + if (!n->IsOp()) continue; + + // create fill_constant op + if (n->Name() == "scale_loss_grad") { + ops->emplace_back(); + auto &desc = ops->back(); + ReplaceScaleLossGradOp(*n, &desc); + } else if (n->Op()) { + ops->emplace_back(*n->Op()); + } + // delete no OpDesc op + } +} + +static void GraphToBlock(const Graph &graph, proto::BlockDesc *block, + const SortKind *sort_kind) { + // Remove the unneeded variables after memory optimization. + std::unordered_set vars2remove; + if (graph.Has(kGraphToProgramVarsToRemove)) { + vars2remove = + graph.Get>(kGraphToProgramVarsToRemove); + VLOG(2) << "graph (id: " << block->idx() << ") to program remove " + << vars2remove.size() << " nodes"; + } + + block->clear_vars(); + std::unordered_set visited_vars; + for (Node *n : graph.Nodes()) { + if (n->IsVar()) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 && + !vars2remove.count(n->Var()->Name()) && + n->GetVarNodeBlockId() == graph.GetBlockId()) { + visited_vars.insert(n->Var()->Name()); + block->add_vars()->MergeFrom(*n->Var()->Proto()); + } + } + } + block->clear_ops(); + + std::vector nodes; + if (sort_kind != nullptr) { + // Inference Memory Optimize relays on this branch. 
+ nodes = TopologyVarientSort(graph, *sort_kind); + } else { + if (FLAGS_convert_all_blocks) { + nodes = TopologySortGraphByDescOrder(graph); + } else { + nodes = TopologySortOperations(graph); + } + } + + std::vector ops; + GetGraphOpDesc(nodes, &ops); + for (auto &op : ops) { + block->add_ops()->MergeFrom(*op.Proto()); + } +} + +void GraphToProgram(const Graph &graph, ProgramDesc *program, + const SortKind *sort_kind) { + PADDLE_ENFORCE_EQ(graph.IsMainGraph(), true, + platform::errors::InvalidArgument( + "This graph is a sub_graph, " + "and can't convert to program individually")); + PADDLE_ENFORCE_NOT_NULL( + program, + platform::errors::InvalidArgument( + "program must not be nullptr when converting graph to program")); + + proto::ProgramDesc program_pb(*(program->Proto())); + auto block = program_pb.mutable_blocks(kRootBlockIndex); + block->set_idx(kRootBlockIndex); + + if (FLAGS_convert_all_blocks) { + GraphToBlock(*graph.GetSubGraph(kRootBlockIndex), block, sort_kind); + + VLOG(3) << "Graph to program need convert " << graph.SubGraphsSize() + << " sub graph"; + for (size_t idx = 0; idx < graph.SubGraphsSize(); ++idx) { + // avoid kRootBlockIndex not 0 + if (idx == kRootBlockIndex) continue; + + block = program_pb.add_blocks(); + block->set_idx(idx); + GraphToBlock(*graph.GetSubGraph(idx), block, sort_kind); + } + } else { + GraphToBlock(graph, block, sort_kind); + } + + program->CopyFrom(program_pb); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 3309f600730e8..f00e3ae37b4da 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -27,6 +27,10 @@ namespace paddle { namespace framework { namespace ir { +constexpr char kGraphToProgramVarsToRemove[] = + "__graph_to_program_vars_to_remove__"; +constexpr char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; + // Compare nodes via node id. class Graph; @@ -117,6 +121,9 @@ std::vector FilterByNodeWrapper(const Graph &graph) { std::vector TopologySortGraphByDescOrder(const Graph &graph); +void GraphToProgram(const Graph &graph, ProgramDesc *p_program, + const SortKind *sort_kind = nullptr); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index b31ccd48aa98b..3ad591c6dff04 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -17,11 +17,8 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" -DECLARE_bool(convert_all_blocks); - namespace paddle { namespace framework { class ProgramDesc; @@ -33,116 +30,12 @@ namespace framework { namespace ir { void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_EQ(graph->IsMainGraph(), true, - platform::errors::InvalidArgument( - "This graph is a sub_graph, " - "and can't convert to program individually")); - - ProgramDesc& program = Get("program"); - - std::unique_ptr program_pb( - new proto::ProgramDesc(*program.Proto())); - - auto block = program_pb->mutable_blocks(kRootBlockIndex); - block->set_idx(kRootBlockIndex); - - if (FLAGS_convert_all_blocks) { - GraphToBlock(graph->GetSubGraph(kRootBlockIndex), block); - - VLOG(3) << "Graph to program need convert " << graph->SubGraphsSize() - << " sub graph"; - for (size_t idx = 0; idx < graph->SubGraphsSize(); ++idx) { - // avoid kRootBlockIndex not 0 - if (idx == kRootBlockIndex) continue; - - block = program_pb->add_blocks(); - block->set_idx(idx); - GraphToBlock(graph->GetSubGraph(idx), block); - } - } else { - GraphToBlock(graph, block); - } - - program.CopyFrom(*program_pb); -} - -OpDesc* ReplaceScaleLossGradOp(ir::Node* node, OpDesc* desc) { - desc->SetType("fill_constant"); - desc->SetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName(), - (static_cast(OpRole::kBackward) | static_cast(OpRole::kLoss))); - desc->SetAttr("value", 1.0f); - std::vector output_names; - for (auto out : node->outputs) { - output_names.emplace_back(out->Name()); - } - desc->SetOutput("Out", output_names); - return desc; -} - -std::vector* GetGraphOpDesc(const std::vector& nodes, - std::vector* ops) { - for (ir::Node* n : nodes) { - // if node is not Op, skip - if (!n->IsOp()) continue; - - // create fill_constant op - if (n->Name() == "scale_loss_grad") { - ops->emplace_back(); - auto& desc = ops->back(); - ReplaceScaleLossGradOp(n, &desc); - } else if (n->Op()) { - ops->emplace_back(*n->Op()); - } else { - // delete no OpDesc op - } - } - return ops; -} - -void GraphToProgramPass::GraphToBlock(const Graph* graph, - proto::BlockDesc* block) const { - // Remove the unneeded variables after memory optimization. - std::unordered_set vars2remove; - if (graph->Has(kGraphToProgramVarsToRemove)) { - vars2remove = graph->Get>( - kGraphToProgramVarsToRemove); - VLOG(2) << "graph (id: " << block->idx() << ") to program remove " - << vars2remove.size() << " nodes"; - } - - block->clear_vars(); - std::unordered_set visited_vars; - for (ir::Node* n : graph->Nodes()) { - if (n->IsVar()) { - if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 && - !vars2remove.count(n->Var()->Name()) && - n->GetVarNodeBlockId() == graph->GetBlockId()) { - visited_vars.insert(n->Var()->Name()); - block->add_vars()->MergeFrom(*n->Var()->Proto()); - } - } - } - block->clear_ops(); - - std::vector nodes; + auto& program = Get("program"); if (Has(kGraphToProgramSortKind)) { - // Inference Memory Optimize relays on this branch. 
- int sort_kind = Get(kGraphToProgramSortKind); - nodes = TopologyVarientSort( - *graph, static_cast(sort_kind)); + auto sort_kind = static_cast(Get(kGraphToProgramSortKind)); + GraphToProgram(*graph, &program, &sort_kind); } else { - if (FLAGS_convert_all_blocks) { - nodes = TopologySortGraphByDescOrder(*graph); - } else { - nodes = TopologySortOperations(*graph); - } - } - - std::vector ops; - GetGraphOpDesc(nodes, &ops); - for (auto& op : ops) { - block->add_ops()->MergeFrom(*op.Proto()); + GraphToProgram(*graph, &program, nullptr); } } diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h index 4997c67a92fdc..3789a0a623df2 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.h +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass.h" namespace paddle { @@ -22,16 +23,9 @@ namespace ir { class Graph; -const char kGraphToProgramVarsToRemove[] = - "__graph_to_program_vars_to_remove__"; -const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; - class GraphToProgramPass : public Pass { protected: void ApplyImpl(ir::Graph* graph) const override; - - private: - void GraphToBlock(const Graph* graph, proto::BlockDesc* block) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 0e5f5867f47b2..42b6244788da0 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -69,6 +69,26 @@ Graph* Pass::Apply(Graph* graph) const { return graph; } +void Pass::Apply(ProgramDesc* main_program, + ProgramDesc* startup_program) const { + PADDLE_ENFORCE_NOT_NULL(main_program, platform::errors::InvalidArgument( + "main program must be provided")); + PADDLE_ENFORCE_NOT_NULL( + startup_program, + platform::errors::InvalidArgument("startup program must be provided")); + + Graph graph(*main_program); + Apply(&graph); + + // TODO(zjl): support details::kStartupProgramDescs and details::kProgramDescs + ProgramDesc new_main_program; + GraphToProgram(graph, &new_main_program); + main_program->CopyFrom(*new_main_program.Proto()); + + startup_program->Flush(); + main_program->Flush(); +} + PassRegistry& PassRegistry::Instance() { static PassRegistry g_pass_info_map; return g_pass_info_map; diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 9c306479bf5d6..8fb96bec9cbd5 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -29,8 +29,15 @@ limitations under the License. */ namespace paddle { namespace framework { +namespace details { +using ProgramDescs = std::vector; +constexpr char kProgramDescs[] = "program_descs"; +constexpr char kStartupProgramDescs[] = "startup_program_descs"; +} // namespace details + namespace ir { class Graph; + template struct PassRegistrar; @@ -57,6 +64,8 @@ class Pass { Graph *Apply(Graph *graph) const; + void Apply(ProgramDesc *main_program, ProgramDesc *startup_program) const; + // Get a reference to the attributed previously set. 
template AttrType &Get(const std::string &attr_name) const { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 20cffaa959019..0f7012940d76b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -36,7 +36,8 @@ class LoDTensor; } // namespace framework } // namespace paddle #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_op_list.h" #endif #ifdef PADDLE_WITH_MKLDNN @@ -1254,7 +1255,8 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, #endif #ifdef PADDLE_WITH_XPU if (kernel_iter == kernels.end() && - is_xpu_place(expected_kernel_key.place_)) { + is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) { VLOG(3) << "missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index ac53eab64917f..f4bbbaa2e70cf 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -138,6 +138,35 @@ void TensorFromArray(const T* src, const size_t& array_size, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + // 1. vector -> npu pinned tensor + platform::NPUPinnedPlace npu_pinned_place; + Tensor npu_pinned_tensor; + npu_pinned_tensor.Resize(dst->dims()); + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type()); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); + + // 2. async copy npu pinned tensor -> npu tensor + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + npu_pinned_place, npu_pinned_ptr, size, + reinterpret_cast(ctx).stream()); + + // 3. 
record event + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation* allocation = + npu_pinned_tensor.Holder().get(); + npu_pinned_allocator->RecordEvent( + allocation, + reinterpret_cast(ctx).stream()); + } +#endif } template diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index c9dffe2d76a43..93b18e7e553b5 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,6 +1,10 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) +IF(WITH_XPU) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils) +ELSE() cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils) +ENDIF() cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 84ee1fbe5df96..75659298ea764 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -49,11 +49,17 @@ void BasicEngine::Init( "the size of tensors is %s, but the size of grad_tensors is %s.", tensors.size(), grad_tensors.size())); + PADDLE_ENFORCE_EQ(accumulators_.empty(), true, + platform::errors::AlreadyExists( + "Accumulators are not empty before preparing it for " + "backward network execution.")); + for (size_t i = 0; i < tensors.size(); ++i) { auto var = tensors[i]; auto grad_tensor = grad_tensors[i]; auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( var->GradVarBase()->GraphIsFreed(), false, platform::errors::Unavailable( @@ -101,6 +107,16 @@ void BasicEngine::Init( *dev_ctx, grad_var); } + VariableWrapper* init_grad_var = var->GradVarBase()->SharedVar().get(); + auto& accumulator = accumulators_[init_grad_var]; + if (!accumulator) { + if (FLAGS_sort_sum_gradient) { + accumulator.reset(new SortedGradientAccumulator(init_grad_var)); + } else { + accumulator.reset(new EagerGradientAccumulator(init_grad_var)); + } + } + init_nodes_.push_back(init_node); } } @@ -237,10 +253,6 @@ void BasicEngine::PrepareDeps() { node_deps_.empty(), true, platform::errors::AlreadyExists("Op deps are not empty before preparing " "it for backward network execution.")); - PADDLE_ENFORCE_EQ(accumulators_.empty(), true, - platform::errors::AlreadyExists( - "Accumulators are not empty before preparing it for " - "backward network execution.")); PADDLE_ENFORCE_EQ(accumulators_with_grad_node_.empty(), true, platform::errors::AlreadyExists( "Accumulators with grad_node as the key are not empty " @@ -311,7 +323,9 @@ void BasicEngine::Execute() { // Start execute Computation graph std::queue> q; for (size_t i = 0; i < init_nodes_.size(); ++i) { - q.push(std::move(init_nodes_[i])); + if (node_deps_[init_nodes_[i].get()] == 0) { + q.push(std::move(init_nodes_[i])); + } } size_t op_num = 0; diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 16f9454e9376e..ba9b70aea7b96 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -92,7 +92,7 
@@ void BKCLParallelContext::Init() { << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id << " ring id: " << ring_id; // it will assign bkcl_comm in XPUDeviceContext within ring_id - platform::BKCLCommContext::Instance().CreateBKCLComm( + platform::BKCLCommContext::Instance().CreateComm( &bkcl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); } @@ -116,7 +116,7 @@ void BKCLParallelContext::InitWithRingID(int ring_id) { << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id << " ring id: " << ring_id; // it will assign bkcl_comm in XPUDeviceContext within ring_id - platform::BKCLCommContext::Instance().CreateBKCLComm( + platform::BKCLCommContext::Instance().CreateComm( &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); } diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 9f036742f0f5d..32becda4edc95 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -75,7 +75,7 @@ void NCCLParallelContext::Init() { << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; // it will assign nccl_comm in CUDADeviceContext within ring_id - platform::NCCLCommContext::Instance().CreateNCCLComm( + platform::NCCLCommContext::Instance().CreateComm( &nccl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); @@ -108,7 +108,7 @@ void NCCLParallelContext::InitWithRingID(int ring_id) { << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id << " ring id: " << ring_id; // it will assign nccl_comm in CUDADeviceContext within ring_id - platform::NCCLCommContext::Instance().CreateNCCLComm( + platform::NCCLCommContext::Instance().CreateComm( &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 57c6ae3cbb0a1..619d31c4f5b25 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -17,7 +17,9 @@ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" - +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/xpu/xpu_op_list.h" +#endif DECLARE_bool(check_nan_inf); namespace paddle { @@ -104,7 +106,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, // Const qualifier of Attrs had to be discarded to overwrite it. 
if (FLAGS_use_mkldnn) { auto& mutable_op_attrs = const_cast(op.Attrs()); - mutable_op_attrs = attrs; + mutable_op_attrs = default_attrs; + for (auto& attr : attrs) { + mutable_op_attrs[attr.first] = attr.second; + } } #endif @@ -127,7 +132,8 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_XPU if (kernel_iter == kernels.end() && - is_xpu_place(expected_kernel_key.place_)) { + is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc index a090f1a90189b..f497acc4b166c 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc @@ -127,6 +127,49 @@ TEST(test_resnet50, serial_diff_batch_trt_fp32) { std::cout << "finish test" << std::endl; } +TEST(test_resnet50, multi_thread4_trt_fp32_bz2) { + int thread_num = 4; + // init input data + std::map my_input_data_map; + my_input_data_map["inputs"] = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare ground truth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); + // get ground truth by disabling ir + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + services::PredictorPool pred_pool(config, thread_num); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + pred_pool.Retrive(i), &my_input_data_map, + &infer_output_data, 2); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + CompareRecord(&truth_output_data, &infer_output_data); + } + + std::cout << "finish multi-thread test" << std::endl; +} + } // namespace paddle_infer int main(int argc, char** argv) { diff --git a/paddle/fluid/inference/tests/infer_ut/test_suite.h b/paddle/fluid/inference/tests/infer_ut/test_suite.h index c3c1b36a6e07a..0e116b01847bf 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_suite.h +++ b/paddle/fluid/inference/tests/infer_ut/test_suite.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "gflags/gflags.h" @@ -117,5 +118,5 @@ void CompareRecord(std::map *truth_output_data, } } -} // namespace demo +} // namespace test } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 8bc9775381be5..bfc4a1d598200 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -33,7 +33,7 @@ #include "paddle/fluid/platform/gpu_info.h" #endif #ifdef PADDLE_WITH_XPU -#include
"paddle/fluid/platform/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_info.h" #endif #include "paddle/fluid/platform/npu_info.h" diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index bc72b4b20d061..6c2fb82cb7cbe 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -31,7 +31,7 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #endif #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" #endif DEFINE_bool(init_allocated_mem, false, diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index f2f8c5d1fb555..3b3be9776c4c5 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" #endif namespace paddle { diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index bb520c270fa2c..1ccd99c71f339 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -397,6 +397,40 @@ class HardSigmoidGradNPUKernel : public framework::OpKernel { } }; +template +class ReciprocalNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + out->mutable_data(place); + auto stream = + ctx.template device_context() + .stream(); + const auto& runner = NpuOpRunner("Reciprocal", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class ReciprocalGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto place = ctx.GetPlace(); + dx->mutable_data(place); + auto stream = + ctx.template device_context() + .stream(); + const auto& runner_dx = + NpuOpRunner("ReciprocalGrad", {*out, *dout}, {*dx}, {}); + runner_dx.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -483,3 +517,17 @@ REGISTER_OP_NPU_KERNEL( ops::HardSigmoidGradNPUKernel, ops::HardSigmoidGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + reciprocal, + ops::ReciprocalNPUKernel, + ops::ReciprocalNPUKernel, + ops::ReciprocalNPUKernel); + +REGISTER_OP_NPU_KERNEL( + reciprocal_grad, + ops::ReciprocalGradNPUKernel, + ops::ReciprocalGradNPUKernel, + ops::ReciprocalGradNPUKernel); diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index 2c7219ef6885b..257a91d7c15d7 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" #include -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_value_op_npu.cc b/paddle/fluid/operators/assign_value_op_npu.cc new file mode 100644 index 0000000000000..c05d18dc0f30b --- /dev/null +++ b/paddle/fluid/operators/assign_value_op_npu.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index d9a4f0da13c77..3c51c65b07390 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -123,30 +123,32 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { #if defined(PADDLE_WITH_ASCEND_CL) // return true if found_inf_or_nan or return false; template -bool ContainsNan(const framework::ExecutionContext& exe_ctx, aclrtStream stream, - const paddle::framework::Tensor* in) { +bool CheckNumerics(const framework::ExecutionContext& exe_ctx, + aclrtStream stream, const paddle::framework::Tensor* in) { auto& dev_ctx = exe_ctx.template device_context(); using Tensor = paddle::framework::Tensor; Tensor out(in->type()); - - Tensor mean(in->type()); - mean.Resize({1}); - mean.mutable_data(dev_ctx.GetPlace()); - std::vector axes; - for (int i = 0; i < in->dims().size(); ++i) { - axes.push_back(i); + out.Resize(in->dims()); + out.mutable_data(dev_ctx.GetPlace()); + + bool found_inf_data = false; + + try { + const auto& runner = + NpuOpRunner("CheckNumerics", {*in}, {out}, + {{"message", std::string("check_numberics")}}); + runner.Run(stream); + dev_ctx.Wait(); + } catch (platform::EnforceNotMet& exception) { + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; + } catch (...) 
{ + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; } - const auto& runner_mean = NpuOpRunner("ReduceMeanD", {*in}, {mean}, - {{"axes", axes}, {"keep_dims", false}}); - - std::vector vec; - TensorToVector(mean, exe_ctx.device_context(), &vec); - if (std::isnan(static_cast(vec[0]))) { - return true; - } - return false; + return found_inf_data; } #endif @@ -214,22 +216,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { framework::Tensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); - bool has_nan = false; + bool check_numerics = false; auto d_type = in->type(); switch (d_type) { case framework::proto::VarType::FP16: case framework::proto::VarType::FP32: { - VLOG(4) << "prepare to check nan"; - has_nan = ContainsNan(ctx, dev_ctx->stream(), in); - VLOG(4) << "ContainsNan:" << has_nan; + VLOG(4) << "prepare to FoundNanInf"; + check_numerics = CheckNumerics(ctx, dev_ctx->stream(), in); + VLOG(4) << "check_numerics:" << check_numerics; break; } default: break; } - if (has_nan) { + if (check_numerics) { T inf = static_cast(std::numeric_limits::infinity()); VLOG(4) << "fill input data constant inf"; auto dims = in->dims(); diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index f4510861672ca..9bf86dc926773 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -24,15 +24,16 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + namespace paddle { namespace framework { class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) -#include "paddle/fluid/platform/collective_helper.h" -#endif namespace paddle { namespace operators { @@ -46,56 +47,51 @@ class CCommInitOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { +// TODO(wangxi): Put this in the unified header file +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + using UniqueId = ncclUniqueId; + using Place = platform::CUDAPlace; + using CommContext = platform::NCCLCommContext; +#elif defined(PADDLE_WITH_XPU_BKCL) + using UniqueId = BKCLUniqueId; + using Place = platform::XPUPlace; + using CommContext = platform::BKCLCommContext; +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with GPU or XPU.")); +#endif + PADDLE_ENFORCE_EQ(is_gpu_place(place) || is_xpu_place(place), true, platform::errors::PreconditionNotMet( "CCommInitOp can run on gpu or xpu place only.")); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( var, platform::errors::InvalidArgument("Input con not be empty.")); - if (is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - ncclUniqueId* nccl_id = var->GetMutable(); - - int nranks = Attr("nranks"); - int rank_id = Attr("rank"); - int rid = Attr("ring_id"); - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; - if (Attr("device_id") >= 0) { - device_id = Attr("device_id"); - } - platform::NCCLCommContext::Instance().CreateNCCLComm( - nccl_id, nranks, rank_id, 
device_id, rid); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should be compiled with GPU.")); -#endif - } else if (is_xpu_place(place)) { + + UniqueId* comm_id = var->GetMutable(); + + int nranks = Attr("nranks"); + int rank_id = Attr("rank"); + int rid = Attr("ring_id"); + #if defined(PADDLE_WITH_XPU_BKCL) - BKCLUniqueId* bkcl_id = var->GetMutable(); - - int nranks = Attr("nranks"); - int rank_id = Attr("rank"); - int rid = Attr("ring_id"); - PADDLE_ENFORCE_EQ( - rid, 0, - platform::errors::OutOfRange( - "Ring id must equal 0 in multi Kunlun cards training, but got %d", - rid)); - int device_id = BOOST_GET_CONST(platform::XPUPlace, place).device; - if (Attr("device_id") >= 0) { - device_id = Attr("device_id"); - } - platform::BKCLCommContext::Instance().CreateBKCLComm( - bkcl_id, nranks, rank_id, device_id, rid); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should be compiled with XPU.")); + PADDLE_ENFORCE_EQ( + rid, 0, + platform::errors::OutOfRange( + "Ring id must equal 0 in multi Kunlun cards training, but got %d", + rid)); #endif - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "CCommInitOp can run on gpu or xpu place only.")); + + int device_id = BOOST_GET_CONST(Place, place).device; + if (Attr("device_id") >= 0) { + device_id = Attr("device_id"); } + CommContext::Instance().CreateComm(comm_id, nranks, rank_id, device_id, + rid); +#endif } }; diff --git a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc index 65685902b422e..ec174ad0e56bc 100644 --- a/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_bkcl_id_op.cc @@ -62,7 +62,7 @@ class CGenBKCLIdOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { int rank = Attr("rank"); - framework::Scope& local_scope = scope.NewScope(); + int ring_id = Attr("ring_id"); std::function func = [&](size_t i) -> std::string { return Output("Out"); @@ -75,14 +75,13 @@ class CGenBKCLIdOp : public framework::OperatorBase { GenBKCLID(&bkcl_ids); std::vector endpoint_list = Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &bkcl_ids); + platform::SendBroadCastCommID(endpoint_list, &bkcl_ids, ring_id); } else { std::string endpoint = Attr("endpoint"); - platform::RecvBroadCastCommID(endpoint, &bkcl_ids); + platform::RecvBroadCastCommID(endpoint, &bkcl_ids, ring_id); } CopyBKCLIDToVar(bkcl_ids, func, scope); - scope.DeleteScope(&local_scope); } }; @@ -108,6 +107,8 @@ For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the ser "(int default 0) " "The rank of the trainer in distributed training.") .SetDefault(0); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); } }; diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc index af1e576a8c74f..9ab7d90efaa9f 100644 --- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -63,7 +63,7 @@ class CGenHCCLIdOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { int rank = Attr("rank"); - framework::Scope& local_scope = scope.NewScope(); + int ring_id = Attr("ring_id"); std::function func = [&](size_t i) -> std::string { return Output("Out"); @@ -79,13 +79,12 @@ class 
CGenHCCLIdOp : public framework::OperatorBase { GenHCCLID(&hccl_ids); std::vector endpoint_list = Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &hccl_ids); + platform::SendBroadCastCommID(endpoint_list, &hccl_ids, ring_id); } else { - platform::RecvBroadCastCommID(server_fd, endpoint, &hccl_ids); + platform::RecvBroadCastCommID(server_fd, endpoint, &hccl_ids, ring_id); } CopyHCCLIDToVar(hccl_ids, func, scope); - scope.DeleteScope(&local_scope); } }; @@ -128,6 +127,8 @@ For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the ser "(int default 0) " "The rank of the trainer in distributed training.") .SetDefault(0); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); } }; diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 470537582e978..0a0a824b77586 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -60,7 +60,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { int rank = Attr("rank"); - framework::Scope& local_scope = scope.NewScope(); + int ring_id = Attr("ring_id"); std::function func = [&](size_t i) -> std::string { return Output("Out"); @@ -76,13 +76,12 @@ class CGenNCCLIdOp : public framework::OperatorBase { GenNCCLID(&nccl_ids); std::vector endpoint_list = Attr>("other_endpoints"); - platform::SendBroadCastCommID(endpoint_list, &nccl_ids); + platform::SendBroadCastCommID(endpoint_list, &nccl_ids, ring_id); } else { - platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids); + platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids, ring_id); } CopyNCCLIDToVar(nccl_ids, func, scope); - scope.DeleteScope(&local_scope); } }; @@ -123,6 +122,8 @@ For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the ser "(int default 0) " "The rank of the trainer in distributed training.") .SetDefault(0); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); } }; diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index be299babdba7a..dc9359ecf5c3d 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index c6cd45dc18ba3..4c0ef02074e2e 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -169,6 +169,35 @@ void ChooseAlgo(const std::vector& perf_results, using framework::ConvSearchCache; +static void SetConvMathType(const framework::ExecutionContext& ctx, + cudnnDataType_t dtype, + const platform::ConvolutionDescriptor& cdesc) { +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + auto& dev_ctx = ctx.template device_context(); + if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; +#if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (dev_ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); +#endif // CUDNN_VERSION_MIN(8, 1, 0) + } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_FMA_MATH)); +#endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_DEFAULT_MATH)); + VLOG(5) << "NOT use cudnn_tensor_op_math"; + } +#endif + return; +} + struct ConvArgs { cudnnHandle_t handle; platform::TensorDescriptor idesc, odesc; @@ -208,36 +237,7 @@ struct SearchAlgorithm { size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; algo_t algo; - -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); - if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; -#if CUDA_VERSION >= 11000 -#if CUDNN_VERSION_MIN(8, 1, 0) - } else if (dev_ctx.GetComputeCapability() >= 80 && - dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; -#endif // CUDNN_VERSION >= 8100 - } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_FMA_MATH)); - VLOG(5) << "use cudnn_fma_math"; -#endif // CUDA_VERSION >= 11000 - } else { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_DEFAULT_MATH)); - VLOG(5) << "use cudnn_default_math"; - } -#endif + SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 @@ -353,24 +353,7 @@ struct SearchAlgorithm { size_t workspace_size = 0; bool has_got_workspace_size = true; algo_t algo; -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - args.cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) 
<< "NOT use cudnn_tensor_op_math"; - if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; - } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { -#if CUDA_VERSION >= 11000 - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_FMA_MATH)); -#endif // CUDA_VERSION >= 11000 - } -#endif + SetConvMathType(ctx, dtype, args.cdesc); if (!exhaustive_search && !deterministic) { #if CUDNN_VERSION >= 7001 @@ -501,25 +484,7 @@ struct SearchAlgorithm { size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; bool has_got_workspace_size = true; - -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - args.cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; - if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; - } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { -#if CUDA_VERSION >= 11000 - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), - CUDNN_FMA_MATH)); -#endif // CUDA_VERSION >= 11000 - } -#endif + SetConvMathType(ctx, dtype, args.cdesc); algo_t algo; if (!exhaustive_search && !deterministic) { diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc new file mode 100644 index 0000000000000..86c872b74ceeb --- /dev/null +++ b/paddle/fluid/operators/crop_op_npu.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/crop_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CropNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + std::vector offset_list; + if (ctx.HasInput("Offsets")) { + auto* offsets_tensor = ctx.Input("Offsets"); + TensorToVector(*offsets_tensor, ctx.device_context(), &offset_list); + if (offset_list.empty()) { + offset_list.resize(x->dims().size(), 0); + } + } else { + auto res = ctx.Attr>("offsets"); + if (res.empty()) { + offset_list.resize(x->dims().size(), 0); + } else { + offset_list.insert(offset_list.end(), res.begin(), res.end()); + } + } + + PADDLE_ENFORCE_EQ( + static_cast(offset_list.size()), x->dims().size(), + platform::errors::InvalidArgument( + "The shape (%d) of CropOp's " + "'offset' attribute should be equal to the shape of dims " + "(%d) of the Input(X).", + offset_list.size(), x->dims().size())); + + int axis_int = 0; + framework::NPUAttributeMap attr_input = {{"offsets", offset_list}, + {"axis", axis_int}}; + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + if (ctx.HasInput("Y")) { + auto* shape = ctx.Input("Y"); + PADDLE_ENFORCE_EQ(shape->dims().size(), x->dims().size(), + platform::errors::InvalidArgument( + "The shape of dims of (%d) of CropOp's " + "Input(shape) should be equal to the shape of dims " + "(%d) of the Input(X).", + shape->dims().size(), x->dims().size())); + + const auto& runner = + NpuOpRunner("Crop", {*x, *shape}, {*out}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } else { + auto shape_size = ctx.Attr>("shape"); + PADDLE_ENFORCE_EQ(shape_size.size(), x->dims().size(), + platform::errors::InvalidArgument( + "The shape of dims of (%d) of CropOp's " + "Input(shape) should be equal to the shape of dims " + "(%d) of the Input(X).", + shape_size.size(), x->dims().size())); + Tensor tmp_shape(x->type()); + tmp_shape.Resize(framework::make_ddim(shape_size)); + tmp_shape.mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + crop, ops::CropNPUKernel, + ops::CropNPUKernel, + ops::CropNPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op_xpu.cc b/paddle/fluid/operators/deformable_conv_op_xpu.cc index 18bab83b0edb8..457616756215c 100644 --- a/paddle/fluid/operators/deformable_conv_op_xpu.cc +++ b/paddle/fluid/operators/deformable_conv_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 79d239074845a..0b0b7095bd5d1 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -11,7 +11,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dropout_op.h" #include #include -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc index ab0a3336b361f..6fc371ee37c52 100644 --- a/paddle/fluid/operators/mean_op_npu.cc +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -91,13 +91,10 @@ class MeanGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( - mean, ops::MeanNPUKernel, - ops::MeanNPUKernel, - ops::MeanNPUKernel, + mean, ops::MeanNPUKernel, ops::MeanNPUKernel) REGISTER_OP_NPU_KERNEL( - mean_grad, ops::MeanGradNPUKernel, + mean_grad, ops::MeanGradNPUKernel, - ops::MeanGradNPUKernel, ops::MeanGradNPUKernel) diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index d73e46df3491b..cb75616221bc4 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 9dcf012d512a9..a0cdd69515da3 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -41,10 +41,13 @@ class MulNPUKernel : public framework::OpKernel { {{"transpose_x1", false}, {"transpose_x2", false}}); runner.Run(stream); - } else if (x->dims().size() == 3 && y->dims().size() == 2) { + } else if (x->dims().size() >= 3 && y->dims().size() == 2) { // reshape Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1] * x->dims()[2]; + int64_t sec_dim = x->dims()[1]; + for (auto i = 2; i < x->dims().size(); i++) { + sec_dim *= x->dims()[i]; + } int64_t first_dim = x->dims()[0]; tmp_x.ShareDataWith(*x); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); @@ -56,7 +59,7 @@ class MulNPUKernel : public framework::OpKernel { runner.Run(stream); } else { PADDLE_THROW( - platform::errors::InvalidArgument("npu error: not suppert dims")); + platform::errors::InvalidArgument("npu error: not support dims")); } // to do other } else if (x->dims().size() == 3 && y->dims().size() == 2) { @@ -135,7 +138,7 @@ class MulGradNPUKernel : public framework::OpKernel { runner_dy.Run(stream); } - } else if (x->dims().size() == 3 && y->dims().size() == 2) { + } else if (x->dims().size() >= 3 && y->dims().size() == 2) { // flatten => x.shape=[6, 4] // matmul if (dx) { @@ -154,7 +157,10 @@ class MulGradNPUKernel : public framework::OpKernel { if (dy) { // flatten Tensor tmp_x(x->type()); - int64_t sec_dim = x->dims()[1] * x->dims()[2]; + int64_t sec_dim = x->dims()[1]; + for (auto i = 2; i < x->dims().size(); i++) { + sec_dim *= x->dims()[i]; + } int64_t first_dim = x->dims()[0]; tmp_x.ShareDataWith(*x); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc index 9cc8ac200b8ee..4f98dde210f7a 100644 --- a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -16,7 +16,7 @@ #include 
"paddle/fluid/operators/reduce_ops/logsumexp_op.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc index a4ed0c85f4f9d..ae27a5d7df473 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_xpu.cc @@ -16,7 +16,7 @@ #include #include #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h index fa9503ec3f0ae..5ae60713bc912 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op_xpu.h @@ -21,7 +21,7 @@ #include #include #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc index bf55221bd3ffd..f759b104d01d1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_xpu.cc @@ -16,7 +16,7 @@ #include #include #include "paddle/fluid/operators/reduce_ops/reduce_op_xpu.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index fb82d18e62f3b..9d637e1cee117 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index fdb90797b69db..e0dfad91570ad 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/scale_op.h" #include -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc index 86fe826c659ef..a164a9b056677 100644 --- a/paddle/fluid/operators/sign_op_xpu.cc +++ b/paddle/fluid/operators/sign_op_xpu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/sign_op.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index f8272d550b999..d53ab914db4d7 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -31,7 +31,13 @@ class StridedSliceOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "StridedSlice"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "StridedSlice"); - + auto input_var_type = ctx->GetInputsVarType("Input")[0]; + if (input_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { + if (ctx->IsRuntime()) { + // shape is determined by Runtime. + return; + } + } auto in_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_LT( in_dims.size(), 7, @@ -154,6 +160,27 @@ class StridedSliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { + auto *in_var = ctx.InputVar("Input"); + auto is_in_var_array = in_var->IsType(); + if (is_in_var_array) { + auto &tensor_array = in_var->Get(); + for (auto &tensor : tensor_array) { + if (!platform::is_cuda_pinned_place(tensor.place())) { + PADDLE_ENFORCE_EQ( + platform::is_same_place(tensor.place(), + ctx.device_context().GetPlace()), + true, + platform::errors::InvalidArgument( + "Place of context is %s. Place of input tensor is %s. They " + "are should be same, but reveived different place.", + string::to_string(ctx.device_context().GetPlace()), + string::to_string(tensor.place()))); + } + } + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } // NOTE: cuda pinned tensor need to copy its data to target place auto in_tensor = ctx.Input("Input"); if (platform::is_cuda_pinned_place(in_tensor->place())) { @@ -179,6 +206,14 @@ class StridedSliceOp : public framework::OperatorWithKernel { } }; +class StridedSliceOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SetOutputType("Out", ctx->GetInputType("Input")); + ctx->SetOutputDataType("Out", ctx->GetInputDataType("Input")); + } +}; + class StridedSliceOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -259,6 +294,13 @@ class StridedSliceOpGrad : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", "Out@GRAD", "StridedSliceGrad"); + auto input_var_type = ctx->GetInputsVarType("Input")[0]; + if (input_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { + if (ctx->IsRuntime()) { + // shape is determined by Runtime + return; + } + } auto x_dims = ctx->GetInputDim("Input"); auto x_grad_name = framework::GradVarName("Input"); if (ctx->HasOutput(x_grad_name)) { @@ -308,6 +350,16 @@ class StridedSliceOpGradMaker : public framework::SingleGradOpMaker { bind->SetType("strided_slice_grad"); } }; +class StridedSliceGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SetOutputType(framework::GradVarName("Input"), + ctx->GetInputType(framework::GradVarName("Out"))); + 
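The StridedSliceFunctor changes in strided_slice_op.h below shift negative start/end indices by axis_size and clamp them at zero. A worked example of that normalization, as a sketch rather than the patched function itself:

#include <algorithm>
#include <cstdint>

// For axis_size = 4, start = -6, end = -1, stride = 1:
//   end   -> -1 + 4 = 3
//   start -> -6 + 4 = -2, clamped to 0
// so the slice covers indices 0, 1, 2.
static int64_t NormalizeIndex(int64_t idx, int64_t axis_size) {
  if (idx < 0 && axis_size > 0) idx += axis_size;
  return std::max<int64_t>(idx, 0);
}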
ctx->SetOutputDataType( + framework::GradVarName("Input"), + ctx->GetInputDataType(framework::GradVarName("Out"))); + } +}; DECLARE_NO_NEED_BUFFER_VARS_INFERER(StridedSliceOpGradNoNeedBufferVarsInferer, "Input"); @@ -318,9 +370,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(StridedSliceOpGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; REGISTER_OPERATOR(strided_slice, ops::StridedSliceOp, ops::StridedSliceOpMaker, ops::StridedSliceOpGradMaker, - ops::StridedSliceOpGradMaker); + ops::StridedSliceOpGradMaker, + ops::StridedSliceOpVarTypeInference); + REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad, - ops::StridedSliceOpGradNoNeedBufferVarsInferer); + ops::StridedSliceOpGradNoNeedBufferVarsInferer, + ops::StridedSliceGradOpVarTypeInference); REGISTER_OP_CPU_KERNEL( strided_slice, diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h index 3c5fb869f68f1..e5b808174ace4 100644 --- a/paddle/fluid/operators/strided_slice_op.h +++ b/paddle/fluid/operators/strided_slice_op.h @@ -127,6 +127,9 @@ static void StridedSliceFunctor(int64_t* starts, int64_t* ends, if (!(ends[axis_index] == -1 && strides[axis_index] < 0)) { // skip None stop condition ends[axis_index] = ends[axis_index] + axis_size; + if (ends[axis_index] < 0) { + ends[axis_index] = 0; + } } } if (decrease_axis_affect) { @@ -136,14 +139,19 @@ static void StridedSliceFunctor(int64_t* starts, int64_t* ends, ends[axis_index] = starts[axis_index] + 1; } } + + if ((starts[axis_index] < 0) && (axis_size > 0)) { + starts[axis_index] += axis_size; + starts[axis_index] = std::max(starts[axis_index], 0); + } + if (strides[axis_index] < 0) { reverse_axis[axis_index] = 1; strides[axis_index] = -strides[axis_index]; if (starts[axis_index] > ends[axis_index]) { // swap the reverse - auto end_dim = dims[axis_index] - 1 < starts[axis_index] - ? dims[axis_index] - 1 - : starts[axis_index]; + auto end_dim = axis_size - 1 < starts[axis_index] ? axis_size - 1 + : starts[axis_index]; auto offset = (end_dim - ends[axis_index]) % strides[axis_index]; offset = offset == 0 ? strides[axis_index] : offset; @@ -162,7 +170,11 @@ template class StridedSliceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_tensor_array = input_var->IsType(); + int rank = is_tensor_array + ? 
1 + : ctx.Input("Input")->dims().size(); switch (rank) { case 1: StridedSliceCompute<1>(ctx); @@ -190,9 +202,17 @@ class StridedSliceKernel : public framework::OpKernel { void StridedSliceCompute(const framework::ExecutionContext& context) const { auto& place = *context.template device_context().eigen_device(); - auto in = context.Input("Input"); - auto out = context.Output("Out"); - auto in_dims = in->dims(); + + framework::DDim in_dims; + auto* input_var = context.InputVar("Input"); + + bool is_input_var_array = input_var->IsType(); + if (is_input_var_array) { + const int64_t size = input_var->Get().size(); + in_dims = framework::make_ddim({size}); + } else { + in_dims = context.Input("Input")->dims(); + } auto starts_int = context.Attr>("starts"); auto ends_int = context.Attr>("ends"); @@ -295,29 +315,97 @@ class StridedSliceKernel : public framework::OpKernel { } } - out->Resize(out_dims); - out->mutable_data(context.GetPlace()); - auto in_t = - framework::EigenTensor::From( - *in); - auto out_t = - framework::EigenTensor::From( - *out, out_dims); - if (need_reverse) { - framework::Tensor tmp; - tmp.mutable_data(out_dims, context.GetPlace()); - auto tmp_t = framework::EigenTensor::From(tmp); - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, strides_indices); - out_t.device(place) = tmp_t.reverse(reverse_axis); + if (is_input_var_array) { + PADDLE_ENFORCE_EQ( + starts_indices.size(), 1, + platform::errors::InvalidArgument( + "When the input of 'strided_slice_op' is `TensorArray`, the " + "dimension of start index should be 1, but received %d.", + starts_indices.size())); + + PADDLE_ENFORCE_EQ( + ends_indices.size(), 1, + platform::errors::InvalidArgument( + "When the input of 'strided_slice_op' is `TensorArray`, the " + "dimension of end index should be 1, but received %d.", + ends_indices.size())); + + PADDLE_ENFORCE_EQ( + strides_indices.size(), 1, + platform::errors::InvalidArgument( + "When the input of 'strided_slice_op' is `TensorArray`, the " + "dimension of stride should be 1, but received %d.", + strides_indices.size())); + + auto* output_var = context.OutputVar("Out"); + + PADDLE_ENFORCE_EQ( + output_var->IsType(), true, + platform::errors::InvalidArgument( + "When the input of `strided_slice_op` is `TensorArray`. 
The " + "output is excepted `TensorArray` , but received %s.", + framework::ToTypeName(output_var->Type()))); + + PADDLE_ENFORCE_EQ( + out_dims_origin.size(), 1, + platform::errors::InvalidArgument( + "When the input of 'strided_slice_op' is `TensorArray`, the " + "dimension of Output should be 1, but received %d", + out_dims_origin.size())); + + auto& in_array = input_var->Get(); + + auto* out_array = context.Output("Out"); + + out_array->resize(out_dims_origin[0]); + size_t const in_array_size = in_array.size(); + for (size_t i = 0; i < out_array->size(); i++) { + size_t in_offset = + (starts_indices[0] % in_array_size) + i * strides_indices[0]; + + int64_t out_offset = i; + if (need_reverse) { + out_offset = out_array->size() - i - 1; + } + + auto& in_tensor = in_array.at(in_offset); + PADDLE_ENFORCE_GT( + in_tensor.memory_size(), 0, + platform::errors::PreconditionNotMet( + "The input LoDTensorArray Input[%d] holds no memory.", + in_offset)); + auto* out_tensor = &out_array->at(out_offset); + + out_tensor->set_lod(in_tensor.lod()); + TensorCopy(in_tensor, context.GetPlace(), out_tensor); + } + } else { - out_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, strides_indices); - } + auto in = context.Input("Input"); + auto out = context.Output("Out"); + out->Resize(out_dims); + out->mutable_data(context.GetPlace()); + auto in_t = framework::EigenTensor::From(*in); + auto out_t = + framework::EigenTensor::From(*out, out_dims); + if (need_reverse) { + framework::Tensor tmp; + tmp.mutable_data(out_dims, context.GetPlace()); + auto tmp_t = framework::EigenTensor::From(tmp); + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, strides_indices); + out_t.device(place) = tmp_t.reverse(reverse_axis); + } else { + out_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, strides_indices); + } - if (decrease_axis.size() > 0) { - out->Resize(out_dims_origin); + if (decrease_axis.size() > 0) { + out->Resize(out_dims_origin); + } } } }; @@ -326,7 +414,11 @@ template class StridedSliceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - size_t rank = ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_tensor_array = input_var->IsType(); + int rank = is_tensor_array + ? 1 + : ctx.Input("Input")->dims().size(); switch (rank) { case 1: StridedSliceGradCompute<1>(ctx); @@ -355,17 +447,27 @@ class StridedSliceGradKernel : public framework::OpKernel { const framework::ExecutionContext& context) const { auto& place = *context.template device_context().eigen_device(); - auto* d_input = - context.Input(framework::GradVarName("Out")); - auto* d_out = - context.Output(framework::GradVarName("Input")); - d_out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::SetConstant set_zero; - set_zero(dev_ctx, d_out, static_cast(0)); - auto out_dims = d_out->dims(); - auto in_dims = d_input->dims(); + + framework::DDim out_dims; + auto* out_var = context.OutputVar(framework::GradVarName("Input")); + bool is_out_var_array = out_var->IsType(); + if (is_out_var_array) { + // Note(weixin):Since the shape of `framework::GradVarName("Input")` of + // StridedSliceGrad cannot be calculated by + // `framework::GradVarName("Output")`, the dim of "Input" is used to + // calculate the output shape. when set it to inplace OP, there may be + // some problems. 
+ const int64_t size = + context.Input("Input")->size(); + + out_dims = framework::make_ddim({size}); + } else { + out_dims = + context.Output(framework::GradVarName("Input")) + ->dims(); + } auto starts_int = context.Attr>("starts"); auto ends_int = context.Attr>("ends"); @@ -438,25 +540,121 @@ class StridedSliceGradKernel : public framework::OpKernel { break; } } - auto in_t = - framework::EigenTensor::From( - *d_input); - auto out_t = - framework::EigenTensor::From( - *d_out, out_dims); - if (need_reverse) { - framework::Tensor reverse_input; - reverse_input.mutable_data(in_dims, context.GetPlace()); - auto reverse_in_t = - framework::EigenTensor::From(reverse_input); - reverse_in_t.device(place) = in_t.reverse(reverse_axis); - out_t.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(place) = reverse_in_t; + if (is_out_var_array) { + PADDLE_ENFORCE_EQ( + starts_indices.size(), 1, + platform::errors::InvalidArgument( + "When the input of 'strided_slice_grad_op' is `TensorArray`, the " + "dimension of start index should be 1, but received %d.", + starts_indices.size())); + PADDLE_ENFORCE_EQ( + ends_indices.size(), 1, + platform::errors::InvalidArgument( + "When the input of 'strided_slice_op' is `TensorArray`, the " + "dimension of end index should be 1, but received %d.", + ends_indices.size())); + PADDLE_ENFORCE_EQ( + strides_indices.size(), 1, + platform::errors::InvalidArgument( + "When the input of 'strided_slice_grad_op' is `TensorArray`, the " + "dimension of stride should be 1, but received %d.", + strides_indices.size())); + + auto* d_input_var = context.InputVar(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ( + d_input_var->IsType(), true, + platform::errors::InvalidArgument( + "When the output of `strided_slice_grad_op` is " + "`TensorArray`, the input is excepted `TensorArray` , " + "but received %s.", + framework::ToTypeName(d_input_var->Type()))); + + PADDLE_ENFORCE_EQ( + out_dims.size(), 1, + platform::errors::InvalidArgument( + "When the output of `strided_slice_grad_op` is `TensorArray`, " + "the dimension of output should be 1, but received %d.", + out_dims.size())); + auto& d_in_array = d_input_var->Get(); + + auto* d_out_array = context.Output( + framework::GradVarName("Input")); + + d_out_array->resize(out_dims[0]); + auto const d_out_array_size = d_out_array->size(); + auto* input_tensor_array = + context.Input("Input"); + + for (size_t j = 0; j < d_out_array_size; j++) { + auto& dim = input_tensor_array->at(j).dims(); + auto* d_out_tensor = &d_out_array->at(j); + + int64_t sub = j - starts_indices[0]; + + int64_t in_offset = sub / strides_indices[0]; + + if (need_reverse) { + in_offset = d_in_array.size() - in_offset - 1; + } + + if ((sub % strides_indices[0] == 0) && (0 <= in_offset) && + (static_cast(in_offset) < d_in_array.size())) { + auto& in_tensor = d_in_array.at(in_offset); + PADDLE_ENFORCE_GT( + in_tensor.memory_size(), 0, + platform::errors::PreconditionNotMet( + "The input LoDTensorArray Input[%d] holds no memory.", + in_offset)); + + d_out_tensor->set_lod(in_tensor.lod()); + TensorCopy(in_tensor, context.GetPlace(), d_out_tensor); + + } else { + d_out_tensor->Resize(dim); + + if (!d_out_tensor->IsInitialized()) { + d_out_tensor->mutable_data(context.GetPlace()); + } + + math::SetConstant set_zero; + set_zero(dev_ctx, d_out_tensor, static_cast(0)); + } + } + } else { - out_t.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(place) = in_t; + auto* d_input = + context.Input(framework::GradVarName("Out")); + 
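The gradient branch above performs the inverse scatter: every output slot j that was selected by the forward slice, i.e. (j - starts[0]) is a multiple of strides[0] whose quotient maps into d_in, receives the corresponding gradient tensor, while every other slot is resized and zero-filled. A sketch of that mapping over integers, consistent with the forward example earlier:

#include <cstdint>
#include <vector>

static std::vector<int> ScatterGrad(const std::vector<int>& d_in,
                                    int64_t start, int64_t stride,
                                    int64_t out_size, bool need_reverse) {
  std::vector<int> d_out(out_size, 0);  // unselected slots stay zero
  for (int64_t j = 0; j < out_size; ++j) {
    int64_t sub = j - start;
    if (sub % stride != 0) continue;
    int64_t in_offset = sub / stride;
    if (need_reverse) {
      in_offset = static_cast<int64_t>(d_in.size()) - in_offset - 1;
    }
    if (in_offset >= 0 && in_offset < static_cast<int64_t>(d_in.size())) {
      d_out[j] = d_in[in_offset];
    }
  }
  return d_out;
}
// ScatterGrad({1, 2}, /*start=*/1, /*stride=*/2, /*out_size=*/5, false)
//   == {0, 1, 0, 2, 0}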
auto* d_out = + context.Output(framework::GradVarName("Input")); + + d_out->mutable_data(context.GetPlace()); + + math::SetConstant set_zero; + set_zero(dev_ctx, d_out, static_cast(0)); + + auto in_dims = d_input->dims(); + + auto in_t = framework::EigenTensor::From(*d_input); + auto out_t = + framework::EigenTensor::From(*d_out, out_dims); + if (need_reverse) { + framework::Tensor reverse_input; + reverse_input.mutable_data(in_dims, context.GetPlace()); + auto reverse_in_t = + framework::EigenTensor::From(reverse_input); + + reverse_in_t.device(place) = in_t.reverse(reverse_axis); + out_t.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(place) = reverse_in_t; + } else { + out_t.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(place) = in_t; + } } } }; diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc index 264cc4e2cf794..d16bb5562ed3a 100644 --- a/paddle/fluid/operators/sum_op_xpu.cc +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -13,7 +13,7 @@ limitations under the License. */ #include "paddle/fluid/operators/sum_op.h" #include -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op_xpu.cc b/paddle/fluid/operators/transpose_op_xpu.cc index 2748c07f9e6d7..360c2125ed1f6 100644 --- a/paddle/fluid/operators/transpose_op_xpu.cc +++ b/paddle/fluid/operators/transpose_op_xpu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 36a956762174e..efd25bc892940 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -69,7 +69,8 @@ cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) if(WITH_XPU) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib) +cc_library(xpu_info SRCS xpu/xpu_info.cc DEPS gflags glog enforce xpulib) +cc_library(xpu_op_list SRCS xpu/xpu_op_list.cc DEPS gflags glog enforce xpulib) endif() if(WITH_ASCEND) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index f2b478f7d20e9..cc9f2c75989db 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -72,8 +72,8 @@ class NCCLCommImpl : public NCCLComm { std::shared_ptr comm_event_; }; -NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks, - int rank, int dev_id, int ring_id) { +NCCLComm* NCCLCommContext::CreateComm(ncclUniqueId* nccl_id, int nranks, + int rank, int dev_id, int ring_id) { PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument( "The nccl unique id should not be null.")); @@ -225,8 +225,8 @@ class BKCLCommImpl : public BKCLComm { std::unique_ptr dev_ctx_; }; -BKCLComm* BKCLCommContext::CreateBKCLComm(BKCLUniqueId* bkcl_id, int nranks, - int rank, int dev_id, int ring_id) { +BKCLComm* BKCLCommContext::CreateComm(BKCLUniqueId* bkcl_id, int nranks, + int rank, int dev_id, int ring_id) { PADDLE_ENFORCE_NOT_NULL(bkcl_id, platform::errors::InvalidArgument( "The bkcl unique id should not be null.")); diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 
b0b857f7ee3f2..b9be9dc8304e1 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -72,8 +72,8 @@ class NCCLCommContext { return comm_ctx; } - NCCLComm* CreateNCCLComm(ncclUniqueId* nccl_id, int nranks, int rank, - int dev_id, int ring_id = 0); + NCCLComm* CreateComm(ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, + int ring_id = 0); void CreateAllNCCLComms(const std::vector& dev_ids, int ring_id = 0); @@ -274,8 +274,8 @@ class BKCLCommContext { return comm_ctx; } - BKCLComm* CreateBKCLComm(BKCLUniqueId* bkcl_id, int nranks, int rank, - int dev_id, int ring_id = 0); + BKCLComm* CreateComm(BKCLUniqueId* bkcl_id, int nranks, int rank, int dev_id, + int ring_id = 0); void CreateAllBKCLComms(const std::vector& dev_ids, int ring_id = 0); diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index 8e969588afbbc..486b3346c3760 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -253,8 +253,14 @@ class ConvolutionDescriptor { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(desc, CUDNN_TENSOR_OP_MATH)); - } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) { #if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (dtype == CUDNN_DATA_BFLOAT16) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(desc, + CUDNN_TENSOR_OP_MATH)); +#endif // CUDNN_VERSION_MIN(8,1,0) + } else if (dtype == CUDNN_DATA_FLOAT && !allow_tf32) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(desc, CUDNN_FMA_MATH)); #endif // CUDA_VERSION >= 11000 diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 82f14c612d1fa..c7162f58de284 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -196,7 +196,10 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { Place CPUDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_XPU -XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); } +XPUDeviceContext::XPUDeviceContext() { + context_ = xpu::create_context(); + xpu_version_ = get_xpu_version(place_.device); +} XPUDeviceContext::~XPUDeviceContext() {} diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 68589f546dc78..abac12ff26648 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -68,8 +68,8 @@ struct GpuDevice; } // namespace Eigen #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu_header.h" -#include "paddle/fluid/platform/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_info.h" #endif #ifdef PADDLE_WITH_ASCEND_CL @@ -137,12 +137,14 @@ struct DefaultDeviceContextType { }; #ifdef PADDLE_WITH_XPU +namespace xpu = baidu::xpu::api; class XPUDeviceContext : public DeviceContext { public: XPUDeviceContext(); explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } + XPUVersion xpu_version() const { return xpu_version_; } Place GetPlace() const override; xpu::Context* x_context() const; @@ -159,6 +161,7 @@ class XPUDeviceContext : public DeviceContext { private: XPUPlace place_; + XPUVersion xpu_version_; xpu::Context* context_; #ifdef PADDLE_WITH_XPU_BKCL BKCLContext_t bkcl_context_; diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc 
b/paddle/fluid/platform/gen_comm_id_helper.cc index 5f6dd5679a1a8..73bc2c41a0bc9 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -42,7 +42,10 @@ namespace platform { std::once_flag SocketServer::init_flag_; -constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +struct CommHead { + int version = 1; // unused for now + int ring_id = 0; +}; // Check system calls, such as socket, bind. #define CHECK_SYS_CALL(call, name) \ @@ -188,11 +191,15 @@ int CreateListenSocket(const std::string& ep) { void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } -static int SocketAccept(int server_fd, const char* head) { +static int SocketAccept(int server_fd, const CommHead head) { + static_assert(sizeof(CommHead) <= 1024, + "sizeof(CommHead) must <= buffer size"); + struct sockaddr_in client_addr; socklen_t addr_length = sizeof(client_addr); char buffer[1024] = {0}; int conn = -1; + const char* phead = reinterpret_cast(&head); while (true) { CHECK_SYS_CALL_VAL( @@ -200,8 +207,10 @@ static int SocketAccept(int server_fd, const char* head) { &addr_length), "accept", conn); - int ret_val = SocketRecv(conn, buffer, strlen(head)); - if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + int ret_val = SocketRecv(conn, buffer, sizeof(head)); + if (ret_val > 0 && memcmp(buffer, phead, sizeof(head)) == 0) { + // send a message to the sender, indicating that the link is correct + CHECK_SYS_CALL(SocketSend(conn, phead, sizeof(head)), "send"); break; // accept client } else { VLOG(3) << "socket read failed with ret_val=" << ret_val; @@ -211,7 +220,7 @@ static int SocketAccept(int server_fd, const char* head) { return conn; } -static int ConnectAddr(const std::string& ep, const char* head) { +static int ConnectAddr(const std::string& ep, const CommHead head) { auto addr = paddle::string::Split(ep, ':'); PADDLE_ENFORCE_EQ( addr.size(), 2UL, @@ -220,9 +229,6 @@ static int ConnectAddr(const std::string& ep, const char* head) { std::string host = addr[0]; int port = std::stoi(addr[1]); - int sock = -1; - CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); - struct sockaddr_in server_addr; memset(&server_addr, 0, sizeof(server_addr)); server_addr.sin_family = AF_INET; @@ -245,10 +251,18 @@ static int ConnectAddr(const std::string& ep, const char* head) { platform::errors::Unavailable("Open address %s failed: %s", ep, strerror(errno))); + static_assert(sizeof(CommHead) <= 1024, + "sizeof(CommHead) must <= buffer size"); + char buffer[1024] = {0}; + const char* phead = reinterpret_cast(&head); + // TODO(wangxi) Set from env, default 900s=15min int timeout = 900 * 1000; int try_times = 0; int total_time = 0; + + int sock = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); while (true) { int ret_val = -1; RETRY_SYS_CALL_VAL( @@ -260,8 +274,19 @@ static int ConnectAddr(const std::string& ep, const char* head) { continue; } - CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); - break; + CHECK_SYS_CALL(SocketSend(sock, phead, sizeof(head)), "send"); + ret_val = SocketRecv(sock, buffer, sizeof(head)); + if (ret_val > 0 && memcmp(buffer, phead, sizeof(head)) == 0) { + // recv same message from recver, indicating that the link is correct + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(sock); + } + sock = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + // unmatched link, retry after 80ms + 
std::this_thread::sleep_for(std::chrono::milliseconds(80)); } return sock; } @@ -295,12 +320,15 @@ static void SendCommID(int conn, CommUniqueId* nccl_id) { template void SendBroadCastCommID(std::vector servers, - std::vector* nccl_ids) { + std::vector* nccl_ids, int ring_id) { + CommHead head; + head.ring_id = ring_id; + // connect with server std::vector connects; for (auto server : servers) { VLOG(3) << "connecting endpoint: " << server; - int conn = ConnectAddr(server, COMM_HEAD); + int conn = ConnectAddr(server, head); connects.push_back(conn); } VLOG(3) << "connecting completed..."; @@ -322,16 +350,18 @@ void SendBroadCastCommID(std::vector servers, template void RecvBroadCastCommID(std::string endpoint, - std::vector* nccl_ids) { + std::vector* nccl_ids, int ring_id) { int server = CreateListenSocket(endpoint); - RecvBroadCastCommID(server, endpoint, nccl_ids); + RecvBroadCastCommID(server, endpoint, nccl_ids, ring_id); CloseSocket(server); } template void RecvBroadCastCommID(int server_fd, std::string endpoint, - std::vector* nccl_ids) { - int client = SocketAccept(server_fd, COMM_HEAD); + std::vector* nccl_ids, int ring_id) { + CommHead head; + head.ring_id = ring_id; + int client = SocketAccept(server_fd, head); for (size_t i = 0; i < nccl_ids->size(); ++i) { VLOG(3) << "trainer: " << endpoint @@ -360,11 +390,15 @@ SocketServer& SocketServer::GetInstance(const std::string& end_point) { } /// template instantiation -#define INSTANT_TEMPLATE(Type) \ - template void SendBroadCastCommID(std::vector servers, \ - std::vector * nccl_ids); \ - template void RecvBroadCastCommID(std::string endpoint, \ - std::vector * nccl_ids); +#define INSTANT_TEMPLATE(Type) \ + template void SendBroadCastCommID(std::vector servers, \ + std::vector * nccl_ids, \ + int ring_id = 0); \ + template void RecvBroadCastCommID( \ + std::string endpoint, std::vector * nccl_ids, int ring_id = 0); \ + template void RecvBroadCastCommID(int server_fd, std::string endpoint, \ + std::vector* nccl_ids, \ + int ring_id = 0); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) INSTANT_TEMPLATE(ncclUniqueId) diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h index fb5d8d8fcd940..6198519eb06df 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.h +++ b/paddle/fluid/platform/gen_comm_id_helper.h @@ -31,16 +31,16 @@ void CloseSocket(int fd); template void SendBroadCastCommID(std::vector servers, - std::vector* nccl_ids); + std::vector* nccl_ids, int ring_id = 0); template void RecvBroadCastCommID(std::string endpoint, - std::vector* nccl_ids); + std::vector* nccl_ids, int ring_id = 0); // recv nccl id from socket template void RecvBroadCastCommID(int server_fd, std::string endpoint, - std::vector* nccl_ids); + std::vector* nccl_ids, int ring_id = 0); class SocketServer { public: diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index ac6988d350f4f..2e0ba9d241c72 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -29,8 +29,8 @@ limitations under the License. 
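In the gen_comm_id_helper.cc changes above, the magic string "_pd_gen_comm_id_" is replaced by a small POD header (CommHead) carrying a version and a ring_id, and a connection is only accepted after both sides have seen identical header bytes, the server now echoing the header back as an acknowledgement. A minimal sketch of the byte-level comparison, matching the memcmp-based check in the patch:

#include <cstring>

struct CommHead {
  int version = 1;  // unused for now
  int ring_id = 0;
};
static_assert(sizeof(CommHead) <= 1024, "CommHead must fit the 1KB buffer");

// Returns true when the peer sent back exactly the header we expect.
static bool HeadMatches(const CommHead& expected, const char* buf,
                        int recv_len) {
  return recv_len > 0 && std::memcmp(buf, &expected, sizeof(CommHead)) == 0;
}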
*/ #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu_header.h" -#include "paddle/fluid/platform/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_info.h" #endif #ifdef WITH_WIN_DUMP_DBG diff --git a/paddle/fluid/platform/xpu/xpu1_op_list.h b/paddle/fluid/platform/xpu/xpu1_op_list.h new file mode 100644 index 0000000000000..131525718cac7 --- /dev/null +++ b/paddle/fluid/platform/xpu/xpu1_op_list.h @@ -0,0 +1,230 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_XPU +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace platform { + +using vartype = paddle::framework::proto::VarType; +using pOpKernelType = paddle::framework::OpKernelType; +using XPUKernelSet = + std::unordered_set; +using XPUOpMap = std::unordered_map; + +XPUOpMap& get_kl1_ops() { + // KL1支持的op,通过op_name, data_type, place来索引 + static XPUOpMap s_xpu1_kernels{ + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gelu_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_switch_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"abs", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"affine_channel", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"affine_channel_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"batch_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"clip_by_norm", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + 
{"coalesce_tensor", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"c_reduce_sum", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"c_allreduce_sum", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicaland", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalnot", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"depthwise_conv2d_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"conv2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"deformable_conv", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"deformable_conv_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"dropout_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_sub", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_sub_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_add", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_add_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_div", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_div_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_pow", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_floordiv", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_mul", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_mul_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_max", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_max_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_min", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elementwise_min_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"fill_constant", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"gaussian_random", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"bilinear_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"nearest_interp_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + 
{"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"layer_norm_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"load", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log_loss_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lookup_table_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"matmul_v2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mean_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"one_hot", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_sum_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_mean", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_max", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reduce_max_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"roi_align_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softmax_with_cross_entropy", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, 
XPUPlace())})}, + {"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"transpose2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"truncated_gaussian_random", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"uniform_random", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze2_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} + // AddMore + }; + + return s_xpu1_kernels; +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/xpu/xpu2_op_list.h b/paddle/fluid/platform/xpu/xpu2_op_list.h new file mode 100644 index 0000000000000..fc80e5ee962f9 --- /dev/null +++ b/paddle/fluid/platform/xpu/xpu2_op_list.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
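The angle-bracketed template arguments of the container aliases in xpu1_op_list.h and xpu2_op_list.h do not survive in the text above; given the hashed OpKernelType containers used elsewhere in the framework, the intended definitions are presumably:

#include <string>
#include <unordered_map>
#include <unordered_set>

#include "paddle/fluid/framework/op_kernel_type.h"

using vartype = paddle::framework::proto::VarType;
using pOpKernelType = paddle::framework::OpKernelType;
using XPUKernelSet =
    std::unordered_set<pOpKernelType, paddle::framework::OpKernelType::Hash>;
using XPUOpMap = std::unordered_map<std::string, XPUKernelSet>;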
*/ +#pragma once + +#ifdef PADDLE_WITH_XPU +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace platform { + +using vartype = paddle::framework::proto::VarType; +using pOpKernelType = paddle::framework::OpKernelType; +using XPUKernelSet = + std::unordered_set; +using XPUOpMap = std::unordered_map; + +XPUOpMap& get_kl2_ops() { + // KL1支持的op,通过op_name, data_type, place来索引 + static XPUOpMap s_xpu2_kernels{ + {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + // AddMore + }; + + return s_xpu2_kernels; +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu/xpu_header.h similarity index 95% rename from paddle/fluid/platform/xpu_header.h rename to paddle/fluid/platform/xpu/xpu_header.h index 99f4224b5d408..caee41ae299c7 100644 --- a/paddle/fluid/platform/xpu_header.h +++ b/paddle/fluid/platform/xpu/xpu_header.h @@ -21,12 +21,9 @@ #include "paddle/fluid/platform/errors.h" #include "paddle/fluid/platform/float16.h" -#include "xpu/api.h" -#include "xpu/refactor/fusion.h" -#include "xpu/refactor/math.h" -#include "xpu/refactor/nn.h" #include "xpu/runtime.h" #include "xpu/runtime_ex.h" +#include "xpu/xdnn.h" namespace xpu = baidu::xpu::api; diff --git a/paddle/fluid/platform/xpu_info.cc b/paddle/fluid/platform/xpu/xpu_info.cc similarity index 86% rename from paddle/fluid/platform/xpu_info.cc rename to paddle/fluid/platform/xpu/xpu_info.cc index f88248fda7e65..6b8ab16b47d68 100644 --- a/paddle/fluid/platform/xpu_info.cc +++ b/paddle/fluid/platform/xpu/xpu_info.cc @@ -8,14 +8,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/platform/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_info.h" #include #include #include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu/xpu_header.h" #include "paddle/fluid/string/split.h" DEFINE_string(selected_xpus, "", @@ -103,5 +103,21 @@ void SetXPUDeviceId(int id) { ret)); } +XPUVersion get_xpu_version(int dev_id) { + uint64_t v = 0; + int ret = xpu_device_get_attr(&v, XPUATTR_MODEL, dev_id); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "xpu_device_get_attr return wrong value[%d]", ret)); + + if (v == K100 || v == K200) { + VLOG(1) << "KUNLUN device " << dev_id << " is XPU1\n"; + return XPU1; + } else { + VLOG(1) << "KUNLUN device " << dev_id << " is XPU2\n"; + return XPU2; + } +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/xpu_info.h b/paddle/fluid/platform/xpu/xpu_info.h similarity index 95% rename from paddle/fluid/platform/xpu_info.h rename to paddle/fluid/platform/xpu/xpu_info.h index 2bf7b0b5cb647..3cb79d51eb7bb 100644 --- a/paddle/fluid/platform/xpu_info.h +++ b/paddle/fluid/platform/xpu/xpu_info.h @@ -51,6 +51,9 @@ class XPUDeviceGuard { int prev_id_{-1}; }; +enum XPUVersion { XPU1, XPU2 }; +XPUVersion get_xpu_version(int dev_id); + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/xpu/xpu_op_list.cc b/paddle/fluid/platform/xpu/xpu_op_list.cc new file mode 100644 index 0000000000000..b3349407942bd --- /dev/null +++ b/paddle/fluid/platform/xpu/xpu_op_list.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef PADDLE_WITH_XPU +#include + +#include "paddle/fluid/platform/xpu/xpu1_op_list.h" +#include "paddle/fluid/platform/xpu/xpu2_op_list.h" +#include "paddle/fluid/platform/xpu/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_op_list.h" + +namespace paddle { +namespace platform { + +bool is_xpu_support_op(std::string op_name, const pOpKernelType& type) { + auto& ops = get_kl1_ops(); + auto v = + get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device); + if (v == XPU2) { + ops = get_kl2_ops(); + } + + if (ops.find(op_name) != ops.end() && + ops[op_name].find(type) != ops[op_name].end()) { + return true; + } + return false; +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/xpu/xpu_op_list.h b/paddle/fluid/platform/xpu/xpu_op_list.h new file mode 100644 index 0000000000000..487bc8ac48b66 --- /dev/null +++ b/paddle/fluid/platform/xpu/xpu_op_list.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_XPU +#include + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace platform { + +using pOpKernelType = paddle::framework::OpKernelType; + +bool is_xpu_support_op(std::string op_name, const pOpKernelType& type); + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 58471dd04ac65..f362808a4b952 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -3,7 +3,7 @@ include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune - feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool + feed_fetch_method pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index b23132f55d7b4..b540d459c2629 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1400,20 +1400,26 @@ void BindImperative(py::module *m_ptr) { )DOC") .def("cuda", - [](const std::shared_ptr &self, int device_id, - bool blocking) { + [](const std::shared_ptr &self, + py::handle &handle, bool blocking) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot copy this Tensor to GPU in CPU version Paddle, " "Please recompile or reinstall Paddle with CUDA support.")); #else int device_count = platform::GetCUDADeviceCount(); - if (device_id == -1) { + int device_id = 0; + if (handle == py::none()) { if (platform::is_gpu_place(self->Place())) { return self; - } else { - device_id = 0; } + } else { + PyObject *py_obj = handle.ptr(); + PADDLE_ENFORCE_EQ( + PyCheckInteger(py_obj), true, + platform::errors::InvalidArgument( + " 'device_id' must be a positive integer")); + device_id = py::cast(handle); } PADDLE_ENFORCE_GE( device_id, 0, @@ -1437,26 +1443,30 @@ void BindImperative(py::module *m_ptr) { } #endif }, - py::arg("device_id") = -1, py::arg("blocking") = true, R"DOC( + py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( Returns a copy of this Tensor in GPU memory. If this Tensor is already in GPU memory and device_id is default, then no copy is performed and the original Tensor is returned. Args: - device_id(int, optional): The destination GPU device id. Defaults to the current device. + device_id(int, optional): The destination GPU device id. Default: None, means current device. blocking(bool, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. Default: False. Examples: .. 
code-block:: python + # required: gpu import paddle x = paddle.to_tensor(1.0, place=paddle.CPUPlace()) print(x.place) # CPUPlace y = x.cuda() print(y.place) # CUDAPlace(0) + + y = x.cuda(None) + print(y.place) # CUDAPlace(0) y = x.cuda(1) print(y.place) # CUDAPlace(1) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index fc8d7ac949a02..4a4c34b149e40 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -23,7 +23,9 @@ #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_desc.h" #include "pybind11/stl.h" @@ -184,5 +186,150 @@ void BindNode(py::module *m) { .value("Variable", Node::Type::kVariable) .export_values(); } + +class PYBIND11_HIDDEN PassAttrGetterSetterRegistry { + private: + PassAttrGetterSetterRegistry() = default; + DISABLE_COPY_AND_ASSIGN(PassAttrGetterSetterRegistry); + + using Getter = std::function; + using Setter = std::function; + + struct GetterSetter { + Getter getter; + Setter setter; + }; + + public: + static PassAttrGetterSetterRegistry &Instance() { + static PassAttrGetterSetterRegistry instance; + return instance; + } + + void Register(const std::string &attr_type, Getter getter, Setter setter) { + PADDLE_ENFORCE_NOT_NULL( + getter, platform::errors::InvalidArgument( + "getter of %s should not be nullptr", attr_type)); + PADDLE_ENFORCE_NOT_NULL( + setter, platform::errors::InvalidArgument( + "setter of %s should not be nullptr", attr_type)); + GetterSetter getter_setter; + getter_setter.getter = std::move(getter); + getter_setter.setter = std::move(setter); + PADDLE_ENFORCE_EQ( + getter_setter_map_.emplace(attr_type, getter_setter).second, true, + platform::errors::InvalidArgument( + "getter and setter of %s have been set before", attr_type)); + } + + py::object Get(const framework::ir::Pass &pass, const std::string &attr_name, + const std::string &attr_type) const { + auto iter = getter_setter_map_.find(attr_type); + PADDLE_ENFORCE_EQ( + iter != getter_setter_map_.end(), true, + platform::errors::InvalidArgument("unsupported attribute type %s of %s", + attr_type, attr_name)); + const auto &getter = iter->second.getter; + return getter(pass, attr_name); + } + + void Set(const std::string &attr_name, const std::string &attr_type, + const py::object &attr_value, framework::ir::Pass *pass) const { + auto iter = getter_setter_map_.find(attr_type); + PADDLE_ENFORCE_EQ( + iter != getter_setter_map_.end(), true, + platform::errors::InvalidArgument("unsupported attribute type %s of %s", + attr_type, attr_name)); + const auto &setter = iter->second.setter; + setter(attr_name, attr_value, pass); + } + + private: + std::unordered_map getter_setter_map_; +}; + +#define REGISTER_PASS_ATTR_GETTER_SETTER(attr_type_name, cpp_type) \ + do { \ + auto getter = [](const framework::ir::Pass &pass, \ + const std::string &attr_name) -> py::object { \ + auto attr_value = pass.Get(attr_name); \ + return py::cast(attr_value); \ + }; \ + auto setter = [](const std::string &attr_name, \ + const py::object &attr_value, \ + framework::ir::Pass *pass) { \ + PADDLE_ENFORCE_NOT_NULL( \ + pass, platform::errors::InvalidArgument("pass should be provided")); \ + try { \ + const auto &cpp_attr_value = py::cast(attr_value); \ + 
pass->Set(attr_name, new cpp_type(cpp_attr_value)); \ + } catch (py::cast_error &) { \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "type error of attribute %s, expected to be %s", attr_name, \ + attr_type_name)); \ + } \ + }; \ + PassAttrGetterSetterRegistry::Instance().Register(attr_type_name, getter, \ + setter); \ + } while (0) + +// NOTE: attr_types may be changed +static void SetAttrsToPass( + const std::unordered_map &attrs, + std::unordered_map *attr_types, + framework::ir::Pass *pass) { + for (const auto &name_and_value : attrs) { + const auto &attr_name = name_and_value.first; + const auto &attr_value = name_and_value.second; + auto &attr_type = (*attr_types)[attr_name]; + if (attr_type.empty()) { + attr_type = py::cast(attr_value.get_type().attr("__name__")); + } + PassAttrGetterSetterRegistry::Instance().Set(attr_name, attr_type, + attr_value, pass); + } +} + +void BindPass(py::module *m) { + // NOTE: pass_attr_types is a dict to indicate the type of each attribute. + // Python has only one integral type "int", but C++ has many integral types. + // If pass_attrs = {"nranks": 1} in Python, we cannot know whether the type + // of "nranks" is size_t or int in C++. Therefore, users can set + // pass_attr_types to indicate the type of "nranks" explicitly, + // i.e. pass_attr_types = {"nranks": "size_t"} means that the type of + // "nranks" is size_t in C++. + REGISTER_PASS_ATTR_GETTER_SETTER("int", int64_t); + REGISTER_PASS_ATTR_GETTER_SETTER("long", int64_t); + REGISTER_PASS_ATTR_GETTER_SETTER("size_t", size_t); + REGISTER_PASS_ATTR_GETTER_SETTER("float32", float); + // Python float is C++ double + REGISTER_PASS_ATTR_GETTER_SETTER("float", double); + REGISTER_PASS_ATTR_GETTER_SETTER("bytes", std::string); + REGISTER_PASS_ATTR_GETTER_SETTER("str", std::string); + + m->def( + "apply_pass", + [](framework::ProgramDesc *main_program, + framework::ProgramDesc *startup_program, const std::string &pass_name, + const std::unordered_map &pass_attrs, + std::unordered_map pass_attr_types) { + auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + SetAttrsToPass(pass_attrs, &pass_attr_types, pass.get()); + pass->Apply(main_program, startup_program); + std::unordered_map result_attrs; + for (const auto &name_and_value : pass_attrs) { + const auto &attr_name = name_and_value.first; + const auto &attr_type = pass_attr_types.at(attr_name); + result_attrs[attr_name] = + PassAttrGetterSetterRegistry::Instance().Get(*pass, attr_name, + attr_type); + } + return result_attrs; + }); +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/ir.h b/paddle/fluid/pybind/ir.h index 5bee70eba695b..2cc1459bbe0fe 100644 --- a/paddle/fluid/pybind/ir.h +++ b/paddle/fluid/pybind/ir.h @@ -21,5 +21,6 @@ namespace paddle { namespace pybind { void BindGraph(pybind11::module *m); void BindNode(pybind11::module *m); +void BindPass(pybind11::module *m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2cda20959178c..b58e9050402bb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -117,7 +117,7 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu_info.h" +#include "paddle/fluid/platform/xpu/xpu_info.h" #endif #ifdef PADDLE_WITH_CRYPTO @@ -3105,6 +3105,7 @@ All parameter, weight, gradient are variables in Paddle. 
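The REGISTER_PASS_ATTR_GETTER_SETTER macro above loses its angle-bracketed template arguments in this text; reconstructed for the "size_t" registration and with the error handling omitted, the getter/setter pair it produces amounts to the following sketch (assuming the usual ir::Pass Get<T>/Set(name, T*) interface):

auto getter = [](const framework::ir::Pass &pass,
                 const std::string &attr_name) -> py::object {
  // Read the attribute owned by the pass and hand it to Python.
  return py::cast(pass.Get<size_t>(attr_name));
};
auto setter = [](const std::string &attr_name, const py::object &attr_value,
                 framework::ir::Pass *pass) {
  // Convert the Python object and transfer ownership of the copy to the pass.
  pass->Set(attr_name, new size_t(py::cast<size_t>(attr_value)));
};
PassAttrGetterSetterRegistry::Instance().Register("size_t", getter, setter);

On the Python side, a call such as apply_pass(main, startup, "some_pass", {"nranks": 2}, {"nranks": "size_t"}) (names illustrative) would route "nranks" through this pair.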
#endif BindGraph(&m); BindNode(&m); + BindPass(&m); BindInferenceApi(&m); BindCompatible(&m); BindDataset(&m); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 60c17eae851d5..a70c9ca996303 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -75,6 +75,7 @@ if not defined PRECISION_TEST set PRECISION_TEST=OFF if not defined NIGHTLY_MODE set PRECISION_TEST=OFF if not defined retry_times set retry_times=3 if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 +if not defined BUILD_DIR set BUILD_DIR=build rem ------initialize the python environment------ set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe @@ -91,16 +92,16 @@ if "%WITH_PYTHON%" == "ON" ( ) rem -------Caching strategy 1: keep build directory for incremental compilation----------- -rmdir build\python /s/q -rmdir build\paddle\third_party\externalError /s/q -rem rmdir build\paddle\fluid\pybind /s/q -rmdir build\paddle_install_dir /s/q -rmdir build\paddle_inference_install_dir /s/q -rmdir build\paddle_inference_c_install_dir /s/q -del build\CMakeCache.txt +rmdir %BUILD_DIR%\python /s/q +rmdir %BUILD_DIR%\paddle\third_party\externalError /s/q +rem rmdir %BUILD_DIR%\paddle\fluid\pybind /s/q +rmdir %BUILD_DIR%\paddle_install_dir /s/q +rmdir %BUILD_DIR%\paddle_inference_install_dir /s/q +rmdir %BUILD_DIR%\paddle_inference_c_install_dir /s/q +del %BUILD_DIR%\CMakeCache.txt if "%WITH_CACHE%"=="OFF" ( - rmdir build /s/q + rmdir %BUILD_DIR% /s/q goto :mkbuild ) @@ -108,7 +109,7 @@ set error_code=0 type %cache_dir%\error_code.txt : set /p error_code=< %cache_dir%\error_code.txt if %error_code% NEQ 0 ( - rmdir build /s/q + rmdir %BUILD_DIR% /s/q goto :mkbuild ) @@ -118,12 +119,12 @@ if %ERRORLEVEL% EQU 0 ( git diff HEAD last_pr --stat --name-only git diff HEAD last_pr --stat --name-only | findstr "setup.py.in" if !ERRORLEVEL! EQU 0 ( - rmdir build /s/q + rmdir %BUILD_DIR% /s/q ) git branch -D last_pr git branch last_pr ) else ( - rmdir build /s/q + rmdir %BUILD_DIR% /s/q git branch last_pr ) @@ -134,21 +135,21 @@ set /p day_before=< %cache_dir%\day.txt if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt - rmdir build /s/q + rmdir %BUILD_DIR% /s/q goto :mkbuild ) :mkbuild -if not exist build ( +if not exist %BUILD_DIR% ( echo Windows build cache FALSE set Windows_Build_Cache=FALSE - mkdir build + mkdir %BUILD_DIR% ) else ( echo Windows build cache TRUE set Windows_Build_Cache=TRUE ) echo ipipe_log_param_Windows_Build_Cache: %Windows_Build_Cache% -cd /d build +cd /d %BUILD_DIR% dir . 
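rem BUILD_DIR defaults to "build" but can now be overridden before this script
rem runs, e.g. to keep separate incremental caches per build configuration.
rem Hypothetical example (directory name and invocation path are only an
rem illustration, not part of this change):
rem     set BUILD_DIR=build_cuda11
rem     call paddle\scripts\paddle_build.bat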
dir %cache_dir% dir paddle\fluid\pybind\Release @@ -162,6 +163,8 @@ if "%WITH_SCCACHE%"=="ON" ( sccache --stop-server 2> NUL if not exist D:\sccache mkdir D:\sccache set SCCACHE_DIR=D:\sccache\.cache + :: sccache will shut down if a source file takes more than 10 mins to compile + set SCCACHE_IDLE_TIMEOUT=0 set SCCACHE_CACHE_SIZE=30G set SCCACHE_ERROR_LOG=D:\sccache\sccache_log.txt set SCCACHE_LOG=quiet @@ -340,7 +343,7 @@ if %day_now% NEQ %day_before% ( ) if "%WITH_TPCACHE%"=="OFF" ( - set THIRD_PARTY_PATH=%work_dir:\=/%/build/third_party + set THIRD_PARTY_PATH=%work_dir:\=/%/%BUILD_DIR%/third_party goto :cmake_impl ) @@ -500,15 +503,6 @@ for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# set end=%end:~4,10% call :timestamp "%start%" "%end%" "Build" -tree /F %cd%\paddle_inference_install_dir\paddle -%cache_dir%\tools\busybox64.exe du -h -d 0 -k %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt -set /p libsize=< lib_size.txt -for /F %%i in ("%libsize%") do ( - set /a libsize_m=%%i/1024 - echo "Windows Paddle_Inference Size: !libsize_m!M" - echo ipipe_log_param_Windows_Paddle_Inference_Size: !libsize_m!M -) - %cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\python\dist > whl_size.txt set /p whlsize=< whl_size.txt for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i" @@ -527,7 +521,6 @@ if %ERRORLEVEL% NEQ 0 ( exit /b 1 ) - set CUDA_VISIBLE_DEVICES=0 python %work_dir%\paddle\scripts\installation_validate.py goto:eof @@ -633,6 +626,15 @@ set end=%end:~4,10% call :timestamp "%start%" "%end%" "1 card TestCases Total" call :timestamp "%start%" "%end%" "TestCases Total" +tree /F %cd%\paddle_inference_install_dir\paddle +%cache_dir%\tools\busybox64.exe du -h -d 0 -k %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt +set /p libsize=< lib_size.txt +for /F %%i in ("%libsize%") do ( + set /a libsize_m=%%i/1024 + echo "Windows Paddle_Inference Size: !libsize_m!M" + echo ipipe_log_param_Windows_Paddle_Inference_Size: !libsize_m!M +) + cd %work_dir%\paddle\fluid\inference\api\demo_ci %cache_dir%\tools\busybox64.exe bash run.sh %work_dir:\=/% %WITH_MKL% %WITH_GPU% %cache_dir:\=/%/inference_demo %TENSORRT_ROOT%/include %TENSORRT_ROOT%/lib %MSVC_STATIC_CRT% goto:eof @@ -650,7 +652,7 @@ echo ======================================== echo Step 6. Check whether deleting a unit test ... 
echo ======================================== -cd /d %work_dir%\build +cd /d %work_dir%\%BUILD_DIR% echo set -e> check_change_of_unittest.sh echo set +x>> check_change_of_unittest.sh echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh @@ -728,7 +730,7 @@ exit /b 1 rem --------------------------------------------------------------------------------------------- :zip_cc_file -cd /d %work_dir%\build +cd /d %work_dir%\%BUILD_DIR% tree /F %cd%\paddle_inference_install_dir\paddle if exist paddle_inference.zip del paddle_inference.zip python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')" @@ -746,7 +748,7 @@ exit /b 1 rem --------------------------------------------------------------------------------------------- :zip_c_file -cd /d %work_dir%\build +cd /d %work_dir%\%BUILD_DIR% tree /F %cd%\paddle_inference_c_install_dir\paddle if exist paddle_inference_c.zip del paddle_inference_c.zip python -c "import shutil;shutil.make_archive('paddle_inference_c', 'zip', root_dir='paddle_inference_c_install_dir')" diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index 004b3fb0f666b..5b8d185212c23 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -156,6 +156,10 @@ def __init__(self, topology): self.is_first_stage = (self.stage_id == 0) self.is_last_stage = (self.stage_id == (self._pp_degree - 1)) + # create p2p_groups + if self._pp_degree > 1: + self._set_p2p_group() + debug_str = "HybridParallelInfo: rank_id: %d, mp_degree: %d, " \ "sharding_degree: %d, pp_degree: %d, dp_degree: %d" % (self.global_rank, self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree) @@ -164,27 +168,9 @@ def __init__(self, topology): self._dp_group, self._check_group) logger.info(debug_str) - # create p2p_groups and no new group - self._p2p_groups = self._build_p2p_lists() - global _HYBRID_PARALLEL_GROUP _HYBRID_PARALLEL_GROUP = self - def _build_p2p_lists(self): - comm_lists = self._topo.get_comm_list('pipe') - p2p_lists = [] - for rank in range(self.nranks): - for comm_ranks in comm_lists: - assert len(comm_ranks) == self._pp_degree - if rank in comm_ranks: - idx = comm_ranks.index(rank) - next_rank = comm_ranks[(idx + 1) % self._pp_degree] - p2p_lists.append([rank, next_rank]) - break - assert len( - p2p_lists) == self.nranks, "len(p2p_lists) should be equal nranks" - return p2p_lists - def get_parallel_mode(self): # there are four modes : DataParallel / TensorParallel / PipelineParallel / ShardingParallel # NOTE when sharding conjugates with other parallel, sharding should act like a optimizer and @@ -236,6 +222,41 @@ def _set_check_group(self, parallel_method="data"): return parallel_group, parallel_comm_group + def _set_p2p_group(self): + comm_lists = self._topo.get_comm_list('pipe') + + self.send_next_group = None + self.send_prev_group = None + self.recv_next_group = None + self.recv_prev_group = None + + for comm_ranks in comm_lists: + assert len(comm_ranks) == self._pp_degree + for idx, rank in enumerate(comm_ranks): + curr_rank = rank + next_rank = comm_ranks[(idx + 1) % self._pp_degree] + prev_rank = comm_ranks[(idx - 1) % self._pp_degree] + + next_group = paddle.distributed.new_group( + ranks=[curr_rank, next_rank]) + if self.global_rank == curr_rank: + self.send_next_group = next_group + elif self.global_rank == next_rank: + self.recv_prev_group = next_group + + prev_group = 
paddle.distributed.new_group( + ranks=[prev_rank, curr_rank]) + + if self.global_rank == curr_rank: + self.send_prev_group = prev_group + elif self.global_rank == prev_rank: + self.recv_next_group = prev_group + + assert self.send_next_group is not None + assert self.send_prev_group is not None + assert self.recv_next_group is not None + assert self.recv_prev_group is not None + def topology(self): return self._topo @@ -287,6 +308,9 @@ def get_pipe_parallel_world_size(self): def get_pipe_parallel_group(self): return self._pp_comm_group + def get_p2p_groups(self): + return self.send_next_group, self.send_prev_group, self.recv_next_group, self.recv_prev_group + # sharding parallel message: def _get_sharding_parallel_id(self): return self._topo.get_coord(self.global_rank).sharding @@ -304,9 +328,6 @@ def get_sharding_parallel_group_src_rank(self): # TODO should the src rank related to the shard rank for each parameter ? return self._sharding_comm_group.ranks[0] - def get_p2p_groups(self): - return self._p2p_groups - # check parallel group def get_check_parallel_group(self): return self._check_comm_group diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 9e891062bcbcc..a44607d13aafc 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -126,11 +126,11 @@ def _add_sync_by_allreduce(block): _add_sync_by_allreduce(block) return + comm_id_var = block.create_var( + name=unique_name.generate('comm_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) if core.is_compiled_with_cuda(): - comm_id_var = block.create_var( - name=unique_name.generate('nccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) block.append_op( type='c_gen_nccl_id', inputs={}, @@ -139,6 +139,7 @@ def _add_sync_by_allreduce(block): 'rank': rank, 'endpoint': current_endpoint, 'other_endpoints': other_endpoints, + 'ring_id': ring_id, OP_ROLE_KEY: OpRole.Forward }) block.append_op( @@ -152,10 +153,6 @@ def _add_sync_by_allreduce(block): OP_ROLE_KEY: OpRole.Forward }) elif core.is_compiled_with_xpu(): - comm_id_var = block.create_var( - name=unique_name.generate('bkcl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) block.append_op( type='c_gen_bkcl_id', inputs={}, @@ -164,6 +161,7 @@ def _add_sync_by_allreduce(block): 'rank': rank, 'endpoint': current_endpoint, 'other_endpoints': other_endpoints, + 'ring_id': ring_id, OP_ROLE_KEY: OpRole.Forward }) block.append_op( @@ -177,24 +175,20 @@ def _add_sync_by_allreduce(block): OP_ROLE_KEY: OpRole.Forward }) elif core.is_compiled_with_npu(): - hccl_id_var = block.create_var( - name=unique_name.generate('hccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) - endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} block.append_op( type='c_gen_hccl_id', inputs={}, - outputs={'Out': hccl_id_var}, + outputs={'Out': comm_id_var}, attrs={ 'rank': rank, 'endpoint': current_endpoint, 'other_endpoints': other_endpoints, + 'ring_id': ring_id, OP_ROLE_KEY: OpRole.Forward }) block.append_op( type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, + inputs={'X': comm_id_var}, outputs={}, attrs={ 'rank': rank, diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py index 8e63635372984..07272404768ff 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py +++ 
b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py @@ -73,7 +73,7 @@ def remove_cast_op(block, params, segment, offset): return inserted_op_num @staticmethod - def prune_fp16(block, shard, reduced_grads_to_param, ring_id): + def prune_fp16(block, shard, reduced_grads_to_param, ring_ids): """ 1. prune all cast_fp16_to_fp32 ops if the param not belongs to this shard 2. revise amp inifine grad checking for sharding @@ -146,6 +146,7 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_id): name=inf_var_name + "@sharding", shape=inf_var.shape, dtype=inf_var.dtype) + block._insert_op_without_sync( update_loss_scaling_op_idx, type='cast', @@ -156,19 +157,26 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_id): "out_dtype": inf_var_int32.dtype, OP_ROLE_KEY: OpRole.Optimize }) - # this allreduce communication should not overlap with calc - block._insert_op_without_sync( - update_loss_scaling_op_idx + 1, - type='c_allreduce_max', - inputs={'X': inf_var_int32}, - outputs={'Out': inf_var_int32}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Optimize - }) + update_loss_scaling_op_idx += 1 + + # allreduce(mp)->allreduce(sharding)->allreduce(pp) + for ring_id in ring_ids: + if ring_id == -1: continue + # this allreduce communication should not overlap with calc + block._insert_op_without_sync( + update_loss_scaling_op_idx, + type='c_allreduce_max', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_int32}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) + update_loss_scaling_op_idx += 1 + block._insert_op_without_sync( - update_loss_scaling_op_idx + 2, + update_loss_scaling_op_idx, type='cast', inputs={'X': inf_var_int32}, outputs={'Out': inf_var_sharding}, @@ -177,11 +185,12 @@ def prune_fp16(block, shard, reduced_grads_to_param, ring_id): "out_dtype": inf_var_sharding.dtype, OP_ROLE_KEY: OpRole.Optimize }) + update_loss_scaling_op_idx += 1 block._sync_with_cpp() # TODO (JZ-LIANG) revise this for uniform mixed parallelism @staticmethod - def sync_amp_check_nan_inf(block, ring_id): + def sync_amp_check_nan_inf(block, ring_ids): update_loss_scaling_op_idx = -1 for idx, op in reversed(list(enumerate(block.ops))): @@ -189,10 +198,14 @@ def sync_amp_check_nan_inf(block, ring_id): update_loss_scaling_op_idx = idx inf_var_name = op.desc.input('FoundInfinite')[0] op._rename_input(inf_var_name, inf_var_name + "@GLOBAL_WORLD") + break # not use amp if update_loss_scaling_op_idx == -1: return + # 0. inf_var_int32 = cast(inf_var) + # 1. inf_var_int32 = allreduce_max(inf_var_int32) + # 3. 
inf_var = cast(inf_var_int32) inf_var = block.var(inf_var_name) inf_var_int32 = block.create_var( name=inf_var_name + "@cast_int32", @@ -212,18 +225,25 @@ def sync_amp_check_nan_inf(block, ring_id): "out_dtype": inf_var_int32.dtype, OP_ROLE_KEY: OpRole.Optimize }) + update_loss_scaling_op_idx += 1 + + # allreduce(mp)->allreduce(pp) + for ring_id in ring_ids: + if ring_id == -1: continue + block._insert_op_without_sync( + update_loss_scaling_op_idx, + type='c_allreduce_max', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_int32}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) + update_loss_scaling_op_idx += 1 + block._insert_op_without_sync( - update_loss_scaling_op_idx + 1, - type='c_allreduce_max', - inputs={'X': inf_var_int32}, - outputs={'Out': inf_var_int32}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Optimize - }) - block._insert_op_without_sync( - update_loss_scaling_op_idx + 2, + update_loss_scaling_op_idx, type='cast', inputs={'X': inf_var_int32}, outputs={'Out': inf_var_global}, @@ -232,4 +252,5 @@ def sync_amp_check_nan_inf(block, ring_id): "out_dtype": inf_var_global.dtype, OP_ROLE_KEY: OpRole.Optimize }) + update_loss_scaling_op_idx += 1 block._sync_with_cpp() diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py index fd74f28b69e19..e3d344dca25b3 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py @@ -25,7 +25,7 @@ def _is_gradient_clip_op(self, op): return op.desc.has_attr("op_namescope") \ and op.desc.attr("op_namescope").startswith("/gradient_clip") - def prune_gradient_clip(self, block, shard, pure_dp_degree=1): + def prune_gradient_clip(self, block, shard, ring_ids): """ prune gradient_clip related ops for params that not belong to cur shard prune: square, reduce_sum, elementwise_mul @@ -82,33 +82,23 @@ def prune_gradient_clip(self, block, shard, pure_dp_degree=1): assert (len(op.desc.output_arg_names()) == 1) sum_res = op.desc.output_arg_names()[0] - # this allreduce should not overlap with calc and should be scheduled in calc stream - block._insert_op_without_sync( - idx + 1, - type='c_allreduce_sum', - inputs={'X': sum_res}, - outputs={'Out': sum_res}, - attrs={ - 'ring_id': self.mp_ring_id, - 'op_namescope': "/gradient_clip_model_parallelism", - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Optimize, - }) - - # global norm should only be sum within each model parallelism word size when use global group - if pure_dp_degree > 1: + # allreduce(mp)->allreduce(sharding)->allreduce(pp) + idx_offset = 1 + for ring_id in ring_ids: + if ring_id == -1: continue + # this allreduce should not overlap with calc and should be scheduled in calc stream block._insert_op_without_sync( - idx + 2, - type='scale', + idx + idx_offset, + type='c_allreduce_sum', inputs={'X': sum_res}, outputs={'Out': sum_res}, attrs={ - 'scale': 1.0 / float(pure_dp_degree), + 'ring_id': ring_id, 'op_namescope': "/gradient_clip_model_parallelism", - 'bias': 0.0, - 'bias_after_scale': False, - OP_ROLE_KEY: OpRole.Optimize + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize, }) + idx_offset += 1 # the grad sum here should take the all and only param in the current shard to_check_param = set(reversed_x_paramname) @@ -126,43 +116,32 @@ def prune_gradient_clip(self, 
block, shard, pure_dp_degree=1): return # TODO (JZ-LIANG) revise this for uniform mixed parallelism - def sync_global_norm(self, block, ring_id, pure_dp_degree=1): + def sync_global_norm(self, block, ring_ids): """ prune gradient_clip related ops for params that not belong to cur shard prune: square, reduce_sum, elementwise_mul keep: sum, sqrt, elementwise_max, elementwise_div """ + # FIXME(wangxi): mp should prune duplicated param_grads for idx, op in reversed(list(enumerate(block.ops))): if not self._is_gradient_clip_op(op): continue if op.type == "sum": sum_res = op.desc.output_arg_names()[0] - block._insert_op_without_sync( - idx + 1, - type='c_allreduce_sum', - inputs={'X': sum_res}, - outputs={'Out': sum_res}, - attrs={ - 'ring_id': ring_id, - 'op_namescope': "/gradient_clip_model_parallelism", - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Optimize, - }) - - # global norm should only be sum within each model parallelism word size - if pure_dp_degree > 1: + for ring_id in ring_ids: + if ring_id == -1: continue + + idx = idx + 1 block._insert_op_without_sync( - idx + 2, - type='scale', + idx, + type='c_allreduce_sum', inputs={'X': sum_res}, outputs={'Out': sum_res}, attrs={ - 'scale': 1.0 / float(pure_dp_degree), + 'ring_id': ring_id, 'op_namescope': "/gradient_clip_model_parallelism", - 'bias': 0.0, - 'bias_after_scale': False, - OP_ROLE_KEY: OpRole.Optimize + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize, }) - - return + return diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 8a591120c0289..df775247c8c9e 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -328,13 +328,17 @@ def minimize_impl(self, # if not use sharding, adapt amp/clip, for remain parallelism. 
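# The amp / gradient-clip helpers used below now receive a list of ring ids
# instead of the single global ring: one c_allreduce is inserted per ring
# (ids of -1 are skipped), and chaining the rings reproduces an allreduce over
# their combined group. A framework-free sketch of why the chained maximum is
# global (chained_allreduce_max is illustrative only, not Paddle API):
def chained_allreduce_max(values, rings):
    # values: {rank: local found_inf flag}
    # rings: one partition of the ranks per ring, e.g. the mp groups,
    #        then the sharding groups, then the pp groups
    for groups in rings:
        for group in groups:
            group_max = max(values[r] for r in group)
            for rank in group:
                values[rank] = group_max
    return values
# e.g. with a 2x2 mp-by-pp grid, rings = [[[0, 1], [2, 3]], [[0, 2], [1, 3]]]
# drives every rank to the global maximum after the second ring.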
# cast --> amp --> clip --> opt if self.sharding_degree <= 1: + # FIXME(wangxi): mp should prune duplicated param_grads when calc + # amp inf_var & clip global_norm_var + # amp - FP16Utils.sync_amp_check_nan_inf(main_block, self.global_ring_id) + FP16Utils.sync_amp_check_nan_inf( + main_block, [self.mp_ring_id, self.pp_ring_id]) # clip - gradientclip_helper = GradientClipHelper(self.global_ring_id) + gradientclip_helper = GradientClipHelper(None) gradientclip_helper.sync_global_norm( - main_block, self.global_ring_id, self.dp_degree) + main_block, [self.mp_ring_id, self.pp_ring_id]) # step6: loss div dp_degree global_dp_degree = self.sharding_degree * self.dp_degree @@ -392,7 +396,6 @@ def _init_pair_comm(self, pair, ring_id): pp_rank, ring_id, False, - global_ring_id=self.global_ring_id, sync=False) def _init_npu_pipeline_comm(self, startup_block): @@ -426,8 +429,6 @@ def _init_npu_pipeline_comm(self, startup_block): pair = send_to_next_pair if even else recv_from_prev_pair ring_id = self.pp_ring_map[pair[0] * 1000 + pair[1]] self._init_pair_comm(pair, ring_id) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) my_pair.remove(pair) logger.info("pair0(even->odd): pp pair:{}, ring_id: {}".format(pair, ring_id)) @@ -436,8 +437,6 @@ def _init_npu_pipeline_comm(self, startup_block): pair = recv_from_next_pair if even else send_to_prev_pair ring_id = self.pp_ring_map[pair[0] * 1000 + pair[1]] self._init_pair_comm(pair, ring_id) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) my_pair.remove(pair) logger.info("pair1(even<-odd): pp pair:{}, ring_id: {}".format(pair, ring_id)) @@ -450,8 +449,6 @@ def _init_npu_pipeline_comm(self, startup_block): pair[0] * 1000 + pair[1], max_ring_id + 1) # 3->0 not in pp_ring_map self._init_pair_comm(pair, ring_id) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) if self.pp_rank != 0 and self.pp_rank != self.pp_degree - 1: my_pair.remove(pair) logger.info("pair2(odd->even): pp pair:{}, ring_id: {}".format( @@ -463,8 +460,6 @@ def _init_npu_pipeline_comm(self, startup_block): pair[0] * 1000 + pair[1], max_ring_id + 2) # 0->3 not in pp_ring_map self._init_pair_comm(pair, ring_id) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) if self.pp_rank != 0 and self.pp_rank != self.pp_degree - 1: my_pair.remove(pair) logger.info("pair3(odd<-even): pp pair:{}, ring_id: {}".format( @@ -478,6 +473,15 @@ def _init_pipeline_comm(self, startup_block): assert self.pp_rank_ == self.pp_rank, "pp rank for pp opt [{}], pp rank for sharding opt [{}]".format( self.pp_rank_, self.pp_rank) + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.pp_group_endpoints, + self.pp_rank, + self.pp_ring_id, + False, + sync=False) + if core.is_compiled_with_npu(): self._init_npu_pipeline_comm(startup_block) return @@ -489,8 +493,6 @@ def _init_pipeline_comm(self, startup_block): logger.info("pp pair:{}, ring_id: {}".format(pair, ring_id)) if self.pp_rank in pair: self._init_pair_comm(pair, ring_id) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) def _init_comm(self): @@ -505,19 +507,6 @@ def _init_comm(self): dtype=core.VarDesc.VarType.INT32, persistable=False) - # global ring - self._collective_helper._init_communicator( - self._startup_program, - self.current_endpoint, - self.global_endpoints, - self.global_rank, - self.global_ring_id, - False, - 
global_ring_id=self.global_ring_id, - sync=False) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) - # mp ring if self.mp_degree > 1: self._collective_helper._init_communicator( @@ -527,10 +516,7 @@ def _init_comm(self): self.mp_rank, self.mp_ring_id, False, - global_ring_id=self.global_ring_id, sync=False) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) # sharding ring if self.sharding_degree > 1: @@ -541,10 +527,7 @@ def _init_comm(self): self.sharding_rank, self.sharding_ring_id, False, - global_ring_id=self.global_ring_id, sync=False) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) # pp ring if self.pp_degree > 1: @@ -559,10 +542,7 @@ def _init_comm(self): self.dp_rank, self.dp_ring_id, False, - global_ring_id=self.global_ring_id, sync=False) - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) startup_block._sync_with_cpp() @@ -736,21 +716,20 @@ def _prune_main_program(self, block): """ weightdecay_helper = WeightDecayHelper() weightdecay_helper.prune_weight_decay(block, self._shard) + + # FIXME(wangxi): mp should prune duplicated param_grads # NOTE (JZ-LIANG) the sync of FoundInfinite should among one entire Model Parallelism # group. and each Data Parallelism group should have its own sync of FoundInfinite # amp could use global group for sync - FP16Utils.prune_fp16(block, self._shard, self._reduced_grads_to_param, - self.global_ring_id) + FP16Utils.prune_fp16( + block, self._shard, self._reduced_grads_to_param, + [self.mp_ring_id, self.sharding_ring_id, self.pp_ring_id]) + # clipbyglobalnorm should only use the Model paramllelism group (mp-sharding-pp) - if self.mp_degree * self.pp_degree == 1: - # separate the sharding-hybrid senario to keep the accuracy - gradientclip_helper = GradientClipHelper(self.sharding_ring_id) - gradientclip_helper.prune_gradient_clip( - block, self._shard, pure_dp_degree=1) - else: - gradientclip_helper = GradientClipHelper(self.global_ring_id) - gradientclip_helper.prune_gradient_clip( - block, self._shard, pure_dp_degree=self.dp_degree) + gradientclip_helper = GradientClipHelper(None) + gradientclip_helper.prune_gradient_clip( + block, self._shard, + [self.mp_ring_id, self.sharding_ring_id, self.pp_ring_id]) # build prog deps reduced_grads = [] @@ -1143,7 +1122,9 @@ def _build_groups(self): # pp if self.pp_degree > 1: - self.pp_ring_id = 20 + self.pp_pair_ring_id = 20 + # pipeline global ring_id set to 4 for sharding0, mp1, dp2, global3 + self.pp_ring_id = 4 self.pp_rank = self.global_rank // (self.sharding_degree * self.mp_degree) % self.pp_degree # (NOTE): Already adjust for (outter-pure) dp @@ -1159,8 +1140,9 @@ def _build_groups(self): pp_first_stage_idx + pp_stage_offset * i]) assert self.current_endpoint in self.pp_group_endpoints else: - self.pp_degree = 1 self.pp_ring_id = -1 + self.pp_degree = 1 + self.pp_pair_ring_id = -1 self.pp_rank = -1 self.pp_group_id = -1 self.pp_group_endpoints = [] @@ -1256,9 +1238,6 @@ def _initialization_broadcast(self, startup_block): outputs={'Out': params}, attrs={'ring_id': self.dp_ring_id, OP_ROLE_KEY: OpRole.Forward}) - # sync within global group - append_naive_sync(startup_block, self.startup_prog_sync_var, - self.global_ring_id) # sharding gradient merge def create_persistable_gradients_and_insert_merge_ops( diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py 
b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index b31b2939695b3..a3c6a5b5fb665 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -13,6 +13,7 @@ # limitations under the License. import math import paddle +import re from paddle.fluid.dygraph.layers import Layer from ...utils.log_util import logger, layer_to_str from functools import partial @@ -20,27 +21,6 @@ __all__ = [] -class SegmentLayers(object): - def __init__(self, layers_desc, num_parts, method="uniform"): - self._layers_desc = layers_desc - self.method = method - self.num_parts = num_parts - self.num_items = len(layers_desc) - assert self.num_items >= self.num_parts, "layer number should be greater than number of segments" - - def do_segment(self): - if self.method == "uniform": - return self.uniform(self.num_items, self.num_parts) - - def uniform(self, num_items, num_parts): - result = [0 for _ in range(num_parts + 1)] - part_size = math.floor(num_items / num_parts) - for i in range(num_parts): - result[i] = int(min(part_size * i, num_items)) - result[num_parts] = num_items - return result - - class LayerDesc(object): def __init__(self, layer_func, *inputs, **kwargs): self.layer_func = layer_func @@ -73,6 +53,75 @@ def __init__(self, self.shared_weight_attr = shared_weight_attr +class SegmentLayers(object): + def __init__(self, layers_desc, num_parts, method="uniform"): + self._layers_desc = layers_desc + self.method = method + self.num_parts = num_parts + self.num_items = len(layers_desc) + assert self.num_items >= self.num_parts, "layer number should be greater than number of segments" + + def do_segment(self): + if self.method == "uniform": + return self.uniform(self.num_items, self.num_parts) + + elif self.method.startswith('layer:'): + # Divide equally according to the specified layer + layername = self.method.split(':')[1] + weights = [0] * len(self._layers_desc) + weight_idxs = self._gen_layer_weight(layername) + for idx in weight_idxs: + weights[idx] = 1 + + assert sum( + weights + ) % self.num_parts == 0, "number of layers ({}) should be divided by part number({})".format( + sum(weights), self.num_parts) + part_size = sum(weights) // self.num_parts + result = [0 for _ in range(self.num_parts + 1)] + + memory_counter = 0 + result_idx = 1 + for idx, weight in enumerate(weights): + memory_counter += weight + if memory_counter == part_size: + result[result_idx] = idx + 1 + result_idx += 1 + memory_counter = 0 + result[self.num_parts] = len(weights) + return result + + def _gen_layer_weight(self, layername): + weight_idxs = [] + regex = re.compile(layername, re.IGNORECASE) + for idx, layer in enumerate(self._layers_desc): + name = None + if isinstance(layer, Layer): + name = layer.__class__.__name__ + elif isinstance(layer, LayerDesc): + name = layer.layer_func.__name__ + else: + try: + name = layer.__name__ + except AttributeError: + # it is not error + continue + if regex.search(name): + weight_idxs.append(idx) + + assert len( + weight_idxs) > 0, "weight_idxs' length should be greater than 0" + return weight_idxs + + def uniform(self, num_items, num_parts): + result = [0 for _ in range(num_parts + 1)] + part_size = math.floor(num_items / num_parts) + for i in range(num_parts): + result[i] = int(min(part_size * i, num_items)) + result[num_parts] = num_items + return result + + class PipelineLayer(Layer): def __init__(self, layers, @@ -205,6 +254,9 @@ def 
_segment_network(self, seg_method): self._layers_desc, num_parts=self._num_stages, method=seg_method) self.segment_parts = seg.do_segment() + logger.info("segment result:" + ", ".join( + str(arg) for arg in self.segment_parts)) + self._start_pos = self.segment_parts[self._stage_id] self._end_pos = self.segment_parts[self._stage_id + 1] diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 9f2a4aaffb474..1cec106caec82 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -11,19 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -import numpy as np - import paddle import paddle.fluid as fluid from .meta_parallel_base import MetaParallelBase -from .pp_utils.utils import is_float_tensor, get_tensor_dtype, paddle_2_number, number_2_dtype -from .pp_utils import utils +from .pp_utils.utils import is_float_tensor from .parallel_layers.pp_layers import PipelineLayer from ..utils.hybrid_parallel_util import broadcast_mp_parameters from ..utils.hybrid_parallel_util import broadcast_dp_parameters from ..utils.log_util import logger -from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer +from ..meta_optimizers.dygraph_optimizer import HybridParallelOptimizer, HybridParallelGradScaler from .pp_utils import p2p_communication as p2p __all__ = [] @@ -35,25 +32,9 @@ def __init__(self, layers, hcg, strategy): raise TypeError( "The Layer should be a derived class of PipelineLayer.") super(PipelineParallel, self).__init__(layers, hcg, strategy) - self.use_pipe_parallel = self._hcg.get_pipe_parallel_world_size() > 1 self.use_data_parallel = self._hcg.get_data_parallel_world_size() > 1 self.use_model_parallel = self._hcg.get_model_parallel_world_size() > 1 - self.is_pipe_partitioned = self.use_model_parallel - - self.num_caches = 0 - self.caches = { - 'inputs': [], - 'labels': [], - 'outputs': [], - } - - self.recv_cache = None - self.grad_tensors = None - - self.send_meta = True - - self.current_loss = paddle.to_tensor(0.0) self.total_loss = None self.micro_batch_size = self._strategy.pipeline_configs[ @@ -63,17 +44,14 @@ def __init__(self, layers, hcg, strategy): self.num_stages = self._hcg.get_pipe_parallel_world_size() self.stage_id = self._hcg.get_stage_id() - self.prev_stage_id = self.stage_id - 1 - self.next_stage_id = self.stage_id + 1 self.pp_group = self._hcg.get_pipe_parallel_group() + p2p.initialize_p2p_groups(hcg) self.is_first_stage = self.stage_id == 0 self.is_last_stage = (self.stage_id == (self.num_stages - 1)) self.global_rank = self._hcg.get_global_rank() - - self.mp_degree = self._hcg.get_model_parallel_world_size() - self.mp_rank = self._hcg.get_model_parallel_rank() + self.micro_batch_id = 0 logger.info("Pipeline Info -- num_stages: {}, stage_id: {}".format( self.num_stages, self.stage_id)) @@ -86,158 +64,160 @@ def __init__(self, layers, hcg, strategy): logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) - def _init_caches(self, num_caches): - if self.num_caches >= num_caches: + def _set_tensor_trainable(self, tensor): + if tensor is None: return - self.num_caches = num_caches - self.num_caches - for key in self.caches: - self.caches[key].extend([None] * self.num_caches) - def _reduce_final_loss(self): - if self.is_last_stage: - 
assert self.total_loss is not None, "train_batch() in last stage should obtain vaild loss" - loss = self.total_loss.clone() / self.accumulate_steps - paddle.distributed.broadcast( - loss, - src=self.global_rank, - use_calc_stream=True, - group=self.pp_group) + if isinstance(tensor, tuple): + for t in tensor: + if is_float_tensor(t): + t.stop_gradient = False else: - loss = paddle.to_tensor(0.0) - paddle.distributed.broadcast( - loss, - src=self._hcg.get_rank_from_stage(self.num_stages - 1), - use_calc_stream=True, - group=self.pp_group) - return loss + if is_float_tensor(tensor): + tensor.stop_gradient = False def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): assert isinstance(optimizer, HybridParallelOptimizer), ( 'optimizer should be HybridParallelOptimizer subclass.') - self.optimizer = optimizer - self.lr_scheduler = lr_scheduler - self.scaler = scaler + if scaler is not None: + assert isinstance(scaler, HybridParallelGradScaler), ( + 'scaler should be HybridParallelGradScaler subclass or None.') assert fluid.framework._dygraph_tracer()._has_grad, ( 'Please enable the generation of gradients.') if self.is_first_stage or self.is_last_stage: assert data is not None, ( - "For the first and the last stage, the data_iter must be set.") + "For the first and the last stage, the data must be set.") else: data = None + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + self.scaler = scaler self.data = data + self._layers.train() # store total loss of entire batch self.total_loss = None - self._init_caches(self.accumulate_steps) - startup_steps = self.num_stages - self.stage_id - 1 - forward_steps = 0 - backward_steps = 0 - # forward - while (forward_steps < self.accumulate_steps): - self._forward(cache_id=forward_steps) - forward_steps += 1 + # store data id for micro_batch + self.micro_batch_id = 0 - # backward - while (backward_steps < self.accumulate_steps): - self._backward(cache_id=backward_steps) - backward_steps += 1 + # Next, use the 1f1b scheduling strategy. 
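# A stage first runs warm-up forwards, then alternates one forward with one
# backward for the remaining micro batches, and finally drains as many
# backwards as it ran warm-up forwards. A small sketch of the step counts
# computed below (schedule_lengths is an illustrative helper, not part of
# this class):
def schedule_lengths(num_stages, stage_id, accumulate_steps):
    startup = min(num_stages - stage_id - 1, accumulate_steps)
    steady = accumulate_steps - startup
    return startup, steady  # the cool-down loop runs another `startup` steps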
+ # this strategy is inspired by: + # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/schedules.py - self._layers.allreduce_shared_weight_gradients() + startup_steps = (self.num_stages - self.stage_id - 1) + startup_steps = min(startup_steps, self.accumulate_steps) + steady_steps = self.accumulate_steps - startup_steps - # optimizer - self.train_loss = self._reduce_final_loss() - self._step() - return self.train_loss + input_buffers = [] + output_buffers = [] - def _forward(self, cache_id): - # load data - self._load_micro_batch(cache_id) - if self.stage_id != 0: - self._recv_activations(cache_id) + for step_id in range(startup_steps): + input_tensor = p2p.recv_forward() + self._set_tensor_trainable(input_tensor) - if isinstance(self.caches['inputs'][cache_id], tuple): - inputs = tuple(t for t in self.caches['inputs'][cache_id]) - else: - inputs = self.caches['inputs'][cache_id] + output_tensor = self._forward_step(input_tensor) + p2p.send_forward(output_tensor) - self._clear_grads(inputs) - outputs = self._layers.forward(inputs) + input_buffers.append(input_tensor) + output_buffers.append(output_tensor) - self.caches['outputs'][cache_id] = outputs + if steady_steps > 0: + input_tensor = p2p.recv_forward() - if self.is_last_stage: - if self._layers._loss_fn is not None: - labels = self.caches['labels'][cache_id] - outputs = self._layers._loss_fn(outputs, labels) + for i in range(steady_steps): + last_iter = (i == (steady_steps - 1)) - if self.is_last_stage: - self.current_loss = outputs - if isinstance(self.current_loss, paddle.Tensor): - if self.total_loss is None: - self.total_loss = paddle.zeros_like(self.current_loss) - self.total_loss += self.current_loss.detach() - else: - if self.total_loss is None: - self.total_loss = [ - paddle.zeros_like(v) for v in self.current_loss - ] - for idx, v in enumerate(self.current_loss): - self.total_loss[idx] += v.detach() + self._set_tensor_trainable(input_tensor) + output_tensor = self._forward_step(input_tensor) - if self.accumulate_steps > 1: - self.current_loss = self.current_loss / self.accumulate_steps + output_tensor_grad = p2p.send_forward_recv_backward(output_tensor) - self.caches['outputs'][cache_id] = self.current_loss.clone() + input_buffers.append(input_tensor) + output_buffers.append(output_tensor) - else: - self._send_activations(cache_id) + input_tensor, output_tensor = input_buffers.pop( + 0), output_buffers.pop(0) - def _backward(self, cache_id): - if self.is_last_stage: - if self.scaler: - paddle.autograd.backward( - self.scaler.scale(self.caches['outputs'][cache_id])) + input_tensor_grad = self._backward_step(input_tensor, output_tensor, + output_tensor_grad) + + if last_iter: + input_tensor = None + p2p.send_backward(input_tensor_grad) else: - paddle.autograd.backward(self.caches['outputs'][cache_id]) + input_tensor = p2p.send_backward_recv_forward(input_tensor_grad) - self._send_gradients(cache_id) - return - self._recv_gradients(cache_id) + for i in range(startup_steps): + input_tensor = input_buffers.pop(0) + output_tensor = output_buffers.pop(0) - outputs = self.caches['outputs'][cache_id] + output_tensor_grad = p2p.recv_backward() - grad_tensors = self.grad_tensors - if isinstance(outputs, tuple): - out_tensors = [t for t in outputs if is_float_tensor(t)] - assert len(out_tensors) == len(grad_tensors) - paddle.autograd.backward( - tensors=out_tensors, grad_tensors=grad_tensors) - else: - paddle.autograd.backward( - tensors=[outputs], grad_tensors=[grad_tensors]) + input_tensor_grad = 
self._backward_step(input_tensor, output_tensor, + output_tensor_grad) + p2p.send_backward(input_tensor_grad) - grad_tensors = None - if self.stage_id != 0: self._send_gradients(cache_id) - self.caches['outputs'][cache_id] = None + self._layers.allreduce_shared_weight_gradients() - def _broadcast_data(self, data): - if isinstance(data, paddle.Tensor): - paddle.distributed.broadcast( - data, - src=self._hcg.get_model_parallel_group_src_rank(), - group=self._hcg.get_model_parallel_group()) + self.train_loss = self._reduce_final_loss() + + # optimizer + self._optimizer_step() + return self.train_loss + + def _forward_step(self, input_tensor): + if self.stage_id == 0: + input_tensor = self._load_micro_batch(self.micro_batch_id) + + output_tensor = self._layers.forward(input_tensor) + + if self.is_last_stage: + labels = self._load_micro_batch(self.micro_batch_id) + output_tensor = self._layers._loss_fn(output_tensor, labels) + assert isinstance( + output_tensor, paddle. + Tensor), "Currently, loss_fn should obtain Paddle.Tensor dtype" + + if self.accumulate_steps > 1: + output_tensor = output_tensor / self.accumulate_steps + + if self.total_loss is None: + self.total_loss = paddle.zeros_like(output_tensor) + self.total_loss += output_tensor.detach() + + self.micro_batch_id += 1 + return output_tensor + + def _backward_step(self, input_tensor, output_tensor, output_tensor_grad): + if self.is_last_stage: + assert output_tensor_grad is None + if self.scaler: + paddle.autograd.backward(self.scaler.scale(output_tensor)) + else: + paddle.autograd.backward(output_tensor) else: - for d in data: - assert isinstance(d, paddle.Tensor) - paddle.distributed.broadcast( - d, - src=self._hcg.get_model_parallel_group_src_rank(), - group=self._hcg.get_model_parallel_group()) - return data + if isinstance(output_tensor, tuple): + outputs = [t for t in output_tensor if not t.stop_gradient] + assert len(outputs) == len(output_tensor_grad) + paddle.autograd.backward( + tensors=outputs, + grad_tensors=[t for t in output_tensor_grad]) + else: + paddle.autograd.backward( + tensors=[output_tensor], grad_tensors=[output_tensor_grad]) + + input_tensor_grad = None + if input_tensor is not None: + if isinstance(input_tensor, tuple): + input_tensor_grad = tuple( + [t.grad for t in input_tensor if not t.stop_gradient]) + else: + input_tensor_grad = input_tensor.grad + return input_tensor_grad def _load_micro_batch(self, cache_id): inputs = self.data @@ -246,8 +226,6 @@ def _load_micro_batch(self, cache_id): if self.is_first_stage: assert len(inputs) == 2, "length of input should be 2" - if self.use_model_parallel: - inputs[0] = self._broadcast_data(inputs[0]) if isinstance(inputs[0], tuple): batch_size = inputs[0][0].shape[0] assert self.micro_batch_size * self.accumulate_steps == batch_size, ( @@ -255,332 +233,51 @@ def _load_micro_batch(self, cache_id): "batch_size = %d, micro_batch_size = %d, accumulate_steps = %d." 
% (batch_size, self.micro_batch_size, self.accumulate_steps)) - data = [ - input[begin:end, :].clone().detach() for input in inputs[0] - ] - self.caches['inputs'][cache_id] = tuple(data) + data = [input[begin:end, :].detach() for input in inputs[0]] + return tuple(data) else: batch_size = inputs[0].shape[0] assert self.micro_batch_size * self.accumulate_steps == batch_size - self.caches['inputs'][cache_id] = inputs[0][begin:end, :].clone( - ).detach() + return inputs[0][begin:end, :].detach() elif self.is_last_stage: assert len(inputs) == 2, "length of input should be 2" - if self.use_model_parallel: - inputs[1] = self._broadcast_data(inputs[1]) if isinstance(inputs[1], tuple): batch_size = inputs[1][0].shape[0] assert self.micro_batch_size * self.accumulate_steps == batch_size - data = [ - input[begin:end, :].clone().detach() for input in inputs[1] - ] - self.caches['labels'][cache_id] = tuple(data) + data = [input[begin:end, :].detach() for input in inputs[1]] + return tuple(data) else: batch_size = inputs[1].shape[0] assert self.micro_batch_size * self.accumulate_steps == batch_size - self.caches['labels'][cache_id] = inputs[1][begin:end, :].clone( - ).detach() + return inputs[1][begin:end, :].detach() else: # No data input is required for other stages inputs = None - def _send_meta(self, data, peer): - if isinstance(data, paddle.Tensor): - tensor_type = paddle.to_tensor([0]) - # send tensor type - p2p.send(tensor_type, self.next_stage_id) - - # send len(shape) - dims = paddle.to_tensor(len(data.shape)) - p2p.send(dims, self.next_stage_id) - - # send shape - shape = paddle.to_tensor(data.shape) - p2p.send(shape, self.next_stage_id) - - # send dtype - dtype = paddle.to_tensor(paddle_2_number(data.dtype)) - p2p.send(dtype, self.next_stage_id) - - elif isinstance(data, tuple): - tensor_type = paddle.to_tensor([1]) - p2p.send(tensor_type, self.next_stage_id) - - nums = paddle.to_tensor(len(data)) - p2p.send(nums, self.next_stage_id) - - for idx, d in enumerate(data): - assert isinstance(d, paddle.Tensor) - # send len(shape) - dims = paddle.to_tensor(len(d.shape)) - p2p.send(dims, self.next_stage_id) - - # send shape - shape = paddle.to_tensor(d.shape) - p2p.send(shape, self.next_stage_id) - - # send dtype - dtype = paddle.to_tensor(paddle_2_number(d.dtype)) - p2p.send(dtype, self.next_stage_id) - - def _recv_meta(self, peer): - tensor_type = paddle.to_tensor([0]) - p2p.recv(tensor_type, self.prev_stage_id) - - tensor_type = tensor_type.item() - - if tensor_type == 0: - # recv len(shape) - dims = paddle.to_tensor([0]) - p2p.recv(dims, self.prev_stage_id) - - dims = dims.item() - - # recv shape - shape = paddle.to_tensor([0] * dims) - p2p.recv(shape, self.prev_stage_id) - - shape = shape.numpy().tolist() - - # recv dtype - dtype = paddle.to_tensor([0]) - p2p.recv(dtype, self.prev_stage_id) - - return self._allocate_cache( - shape, dtype=number_2_dtype(dtype.item()), num_caches=1)[0] - elif tensor_type == 1: - num = paddle.to_tensor([0]) - p2p.recv(num, self.prev_stage_id) - num = num.item() - shapes = [] - dtypes = [] - for i in range(num): - # recv len(shape) - dims = paddle.to_tensor([0]) - p2p.recv(dims, self.prev_stage_id) - - # recv shape - dims = dims.item() - shape = paddle.to_tensor([0] * dims) - p2p.recv(shape, self.prev_stage_id) - shapes.append(shape.numpy().tolist()) - - # recv dtype - dtype = paddle.to_tensor([0]) - p2p.recv(dtype, self.prev_stage_id) - dtypes.append(number_2_dtype(dtype.item())) - - caches = self._allocate_caches(shapes, dtypes, num_caches=1)[0] - caches = 
tuple(caches) - return caches - - def _is_valid_send_recv(self, tensor): - tensor_numel = np.prod(tensor.shape) - assert tensor_numel != 0, "can't send/recv zero element" - return tensor_numel % self.mp_degree == 0 - - def _send_activations(self, cache_id): - outputs = self.caches['outputs'][cache_id] - - if self.send_meta: - self.send_meta = False - self._send_meta(outputs, self.next_stage_id) - - if isinstance(outputs, paddle.Tensor): - if self.is_pipe_partitioned and self._is_valid_send_recv(outputs): - p2p.send_partial( - outputs.detach(), - self.next_stage_id, - mp_degree=self.mp_degree, - mp_rank=self.mp_rank) - else: - p2p.send(outputs.detach(), self.next_stage_id) - - elif isinstance(outputs, tuple): - for output in outputs: - if self.is_pipe_partitioned and self._is_valid_send_recv( - output): - p2p.send_partial( - output.detach(), - self.next_stage_id, - mp_degree=self.mp_degree, - mp_rank=self.mp_rank) - else: - p2p.send(output.detach(), self.next_stage_id) - - def _send_gradients(self, cache_id): - inputs = self.caches['inputs'][cache_id] - if isinstance(inputs, paddle.Tensor): - assert inputs.grad is not None - if self.is_pipe_partitioned and self._is_valid_send_recv( - inputs.grad): - grad = p2p.send_partial( - inputs.grad, - self.prev_stage_id, - mp_degree=self.mp_degree, - mp_rank=self.mp_rank) - else: - p2p.send(inputs.grad, self.prev_stage_id) - else: - for idx, d in enumerate(inputs): - # Skip tensors that will not produce a grad - if not is_float_tensor(d): - assert d.grad is None - continue - - if self.is_pipe_partitioned and self._is_valid_send_recv( - d.grad): - grad = p2p.send_partial( - d.grad, - self.prev_stage_id, - mp_degree=self.mp_degree, - mp_rank=self.mp_rank) - else: - p2p.send(d.grad, self.prev_stage_id) - - self.caches['inputs'][cache_id] = None - - def _recv_activations(self, cache_id): - inputs = None - if self.recv_cache is None: - self.recv_cache = self._recv_meta(self.prev_stage_id) - - if isinstance(self.recv_cache, paddle.Tensor): - if self.is_pipe_partitioned and self._is_valid_send_recv( - self.recv_cache): - p2p.recv_partial(self.recv_cache, self.prev_stage_id, - self.mp_degree, self.mp_rank) - p2p.partial_allgather_operator( - self.recv_cache, - mp_ranks=self.mp_degree, - mp_rank_id=self.mp_rank, - group=self._hcg.get_model_parallel_group(), - use_calc_stream=True) - else: - p2p.recv(self.recv_cache, self.prev_stage_id) - - inputs = self.recv_cache.clone().detach() - inputs.stop_gradient = not is_float_tensor(inputs) - + def _reduce_final_loss(self): + if self.is_last_stage: + assert self.total_loss is not None, "train_batch() in last stage should obtain vaild loss" + loss = self.total_loss.detach() + paddle.distributed.broadcast( + loss, + src=self.global_rank, + use_calc_stream=True, + group=self.pp_group) else: - assert isinstance(self.recv_cache, tuple) - inputs = [None] * len(self.recv_cache) - for idx, d in enumerate(self.recv_cache): - if self.is_pipe_partitioned and self._is_valid_send_recv(d): - assert isinstance(d, paddle.Tensor) - p2p.recv_partial(d, self.prev_stage_id, self.mp_degree, - self.mp_rank) - p2p.partial_allgather_operator( - d, - mp_ranks=self.mp_degree, - mp_rank_id=self.mp_rank, - group=self._hcg.get_model_parallel_group(), - use_calc_stream=True) - else: - assert isinstance(d, paddle.Tensor) - p2p.recv(d, self.prev_stage_id) - inputs[idx] = d.clone().detach() - - inputs = tuple(inputs) - - for d in inputs: - d.stop_gradient = not is_float_tensor(d) - - self.caches['inputs'][cache_id] = inputs - - def 
_recv_gradients(self, cache_id): - outputs = self.caches['outputs'][cache_id] - if self.grad_tensors is None: - if isinstance(outputs, paddle.Tensor): - s = list(outputs.shape) - dtype = get_tensor_dtype(outputs.dtype) - self.grad_tensors = self._allocate_cache( - s, dtype, num_caches=1)[0] - else: - sizes = [list(d.shape) for d in outputs if is_float_tensor(d)] - dtypes = [ - get_tensor_dtype(d.dtype) for d in outputs - if is_float_tensor(d) - ] - self.grad_tensors = self._allocate_caches( - sizes, dtypes, num_caches=1)[0] - - if isinstance(self.grad_tensors, paddle.Tensor): - if self.is_pipe_partitioned and self._is_valid_send_recv( - self.grad_tensors): - p2p.recv_partial(self.grad_tensors, self.next_stage_id, - self.mp_degree, self.mp_rank) - p2p.partial_allgather_operator( - self.grad_tensors, - mp_ranks=self.mp_degree, - mp_rank_id=self.mp_rank, - group=self._hcg.get_model_parallel_group(), - use_calc_stream=True) - else: - p2p.recv(self.grad_tensors, self.next_stage_id) + loss = paddle.zeros(shape=[1], dtype="float32") + paddle.distributed.broadcast( + loss, + src=self._hcg.get_rank_from_stage(self.num_stages - 1), + use_calc_stream=True, + group=self.pp_group) + return loss - else: - assert isinstance(outputs, tuple) - for d in self.grad_tensors: - if self.is_pipe_partitioned and self._is_valid_send_recv(d): - p2p.recv_partial(d, self.next_stage_id, self.mp_degree, - self.mp_rank) - p2p.partial_allgather_operator( - d, - mp_ranks=self.mp_degree, - mp_rank_id=self.mp_rank, - group=self._hcg.get_model_parallel_group(), - use_calc_stream=True) - else: - p2p.recv(d, self.next_stage_id) - - def _step(self): + def _optimizer_step(self): if self.scaler: self.scaler.minimize(self.optimizer, self.train_loss) else: self.optimizer.step() + self.optimizer.clear_grad() if self.lr_scheduler: self.lr_scheduler.step() - - def _clear_grads(self, inputs): - if isinstance(inputs, paddle.Tensor): - if inputs.grad is not None: - inputs.clear_gradient() - else: - for d in inputs: - if d.grad is not None: - d.clear_gradient() - - def _allocate_zeros(self, shape, dtype): - return paddle.zeros(shape, dtype) - - def _allocate_cache(self, shape, dtype, num_caches=-1): - caches = [] - if num_caches == -1: - num_caches = self.num_caches - for count in range(num_caches): - caches.append(self._allocate_zeros(shape, dtype)) - return caches - - def _allocate_caches(self, shapes, dtypes, num_caches=-1): - caches = [] - if num_caches == -1: - num_caches = self.num_caches - for count in range(num_caches): - cache = [] - for shape, dtype in zip(shapes, dtypes): - cache.append(self._allocate_zeros(shape, dtype)) - caches.append(cache) - return caches - - def save_state_dict(self, model_path): - state_dict = self._layers.state_dict() - paddle.save(state_dict, model_path) - - def load_state_dict(self, model_path): - state_dict = paddle.load(self.model_path) - self._layers.set_state_dict(state_dict) - - def forward(self, *inputs, **kwargs): - raise RuntimeError("Call train_batch for pipeline instead of forward.") diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 44090be94f1a7..e533b2ef3f7a3 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -13,131 +13,388 @@ # limitations under the License. 
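# p2p communication for pipeline parallel: tensor shape/dtype metadata is
# cached in SendRecvMeta, and activations/gradients travel as per-mp-rank
# partial tensors that are reassembled with a partial allgather over the
# model-parallel group, so each mp rank only transfers its own slice.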
import paddle +from .utils import paddle_2_number, number_2_dtype +from ...utils.log_util import logger -_groups = None _hcg = None def initialize_p2p_groups(hcg): - global _groups, _hcg - _groups = [ - paddle.distributed.new_group(ranks=group) - for group in hcg.get_p2p_groups() - ] + global _hcg _hcg = hcg + send_next_group, send_prev_group, recv_next_group, recv_prev_group = _hcg.get_p2p_groups( + ) + debug_str = "P2pInfo: send_next_group: %s, send_prev_group: %s, " \ + "recv_next_group: %s, recv_prev_group: %s" % (repr(send_next_group), + repr(send_prev_group),repr(recv_next_group), repr(recv_prev_group)) + logger.info(debug_str) -def _is_valid_communciate(src_stage, dest_stage): - first_stage = 0 - last_stage = _hcg.get_pipe_parallel_world_size() - 1 - assert abs(src_stage-dest_stage) == 1 or \ - (src_stage == first_stage and dest_stage == last_stage) or \ - (src_stage == last_stage and dest_stage == first_stage) +class SendRecvMeta: + """Mainly used to help p2p communication context information""" -def partial_send_operator(tensor, - dst=0, - mp_ranks=1, - mp_rank_id=0, - group=None, - use_calc_stream=True): + def __init__(self): + self.send_shape_message = None + self.send_dtype_message = None + self.recv_shape_message = None + self.recv_dtype_message = None + + self.has_send_meta = False + self.has_recv_meta = False + + def _recv_shape_dtype(self, group): + # recv len(shape) + dims = paddle.to_tensor([0]) + paddle.distributed.recv(dims, src=0, group=group) + dims = dims.item() + + # recv shape + shape = paddle.to_tensor([0] * dims) + paddle.distributed.recv(shape, src=0, group=group) + + # recv dtype + dtype = paddle.to_tensor([0]) + paddle.distributed.recv(dtype, src=0, group=group) + return shape.numpy().tolist(), dtype.item() + + def recv_meta(self, group): + tensor_type = paddle.to_tensor([0]) + paddle.distributed.recv(tensor_type, src=0, group=group) + tensor_type = tensor_type.item() + + if tensor_type == 0: + shape, dtype = self._recv_shape_dtype(group) + self.recv_shape_message = shape + self.recv_dtype_message = dtype + + elif tensor_type == 1: + num = paddle.to_tensor([0]) + paddle.distributed.recv(num, src=0, group=group) + num = num.item() + shapes = [] + dtypes = [] + for i in range(num): + shape, dtype = self._recv_shape_dtype(group) + shapes.append(shape) + dtypes.append(dtype) + + self.recv_shape_message = tuple(shapes) + self.recv_dtype_message = tuple(dtypes) + + def _send_dims_shape_dtype(self, tensor, group): + # send len(shape) + dims = paddle.to_tensor(len(tensor.shape)) + paddle.distributed.send(dims, dst=1, group=group) + + # send shape + shape = paddle.to_tensor(tensor.shape) + paddle.distributed.send(shape, dst=1, group=group) + + # send dtype + dtype = paddle.to_tensor(paddle_2_number(tensor.dtype)) + paddle.distributed.send(dtype, dst=1, group=group) + + def send_meta(self, tensor, group): + if isinstance(tensor, paddle.Tensor): + tensor_type = paddle.to_tensor([0]) + # send tensor type + paddle.distributed.send(tensor_type, dst=1, group=group) + + self._send_dims_shape_dtype(tensor, group) + elif isinstance(tensor, tuple): + tensor_type = paddle.to_tensor([1]) + # send tensor type + paddle.distributed.send(tensor_type, dst=1, group=group) + + nums = paddle.to_tensor(len(tensor)) + paddle.distributed.send(nums, dst=1, group=group) + + for d in tensor: + assert isinstance(d, paddle.Tensor) + self._send_dims_shape_dtype(d, group=group) + + def set_send_message(self, tensor): + if isinstance(tensor, paddle.Tensor): + self.send_shape_message = tensor.shape + 
self.send_dtype_message = paddle_2_number(tensor.dtype) + elif isinstance(tensor, tuple): + self.send_shape_message = tuple( + [d.shape for d in tensor if not d.stop_gradient]) + self.send_dtype_message = tuple( + [paddle_2_number(d.dtype) for d in tensor]) + + +_send_recv_meta = SendRecvMeta() + + +def send_partial(tensor, + dst=0, + nranks=1, + rank_id=0, + group=None, + use_calc_stream=True): if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id return paddle.fluid.core.ops.partial_send( tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'peer', - dst, 'num', mp_ranks, 'id', mp_rank_id) + dst, 'num', nranks, 'id', rank_id) -def partial_recv_operator(tensor, - src=0, - mp_ranks=1, - mp_rank_id=0, - group=None, - use_calc_stream=True): - +def recv_partial(tensor, + src=0, + nranks=1, + rank_id=0, + group=None, + use_calc_stream=True): if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id - return paddle.fluid.core.ops.partial_recv( + paddle.fluid.core.ops.partial_recv( tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'peer', - src, 'num', mp_ranks, 'id', mp_rank_id, 'dtype', tensor.dtype, - 'out_shape', tensor.shape) + src, 'num', nranks, 'id', rank_id, 'dtype', tensor.dtype, 'out_shape', + tensor.shape) -def partial_allgather_operator(tensor, - mp_ranks=1, - mp_rank_id=0, - group=None, - use_calc_stream=True): +def allgather_partial(tensor, + nranks=1, + rank_id=0, + group=None, + use_calc_stream=True): + if nranks == 1: + return tensor if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id return paddle.fluid.core.ops.partial_allgather_( tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, - 'nranks', mp_ranks, 'rank', mp_rank_id) - - -def send(tensor, dest_stage): - global _groups, _hcg - src_stage = _hcg.get_stage_id() - _is_valid_communciate(src_stage, dest_stage) - group = _get_send_recv_group(src_stage, dest_stage) - return paddle.distributed.send( - tensor, dst=1 if dest_stage > src_stage else 0, group=group) - - -def recv(tensor, src_stage): - global _groups, _hcg - dest_stage = _hcg.get_stage_id() - - _is_valid_communciate(src_stage, dest_stage) - group = _get_send_recv_group(src_stage, dest_stage) - return paddle.distributed.recv( - tensor, src=0 if dest_stage > src_stage else 1, group=group) - - -def send_partial(tensor, dest_stage, mp_degree, mp_rank): - global _groups, _hcg - src_stage = _hcg.get_stage_id() - _is_valid_communciate(src_stage, dest_stage) - group = _get_send_recv_group(src_stage, dest_stage) - return partial_send_operator( - tensor, - dst=1 if dest_stage > src_stage else 0, - mp_ranks=mp_degree, - mp_rank_id=mp_rank, - group=group) - - -def recv_partial(tensor, src_stage, mp_degree, mp_rank): - global _groups, _hcg - dest_stage = _hcg.get_stage_id() - - _is_valid_communciate(src_stage, dest_stage) - group = _get_send_recv_group(src_stage, dest_stage) - return partial_recv_operator( - tensor, - src=0 if dest_stage > src_stage else 1, - mp_ranks=mp_degree, - mp_rank_id=mp_rank, - group=group) - - -def _get_send_recv_group(src_stage, dest_stage): - global _groups, _hcg - stage_id = None - first_stage = 0 - last_stage = _hcg.get_pipe_parallel_world_size() - 1 - if (src_stage == first_stage and dest_stage == last_stage) or \ - (dest_stage == first_stage and src_stage == last_stage): - stage_id = last_stage - elif src_stage > dest_stage: - stage_id = dest_stage + 'nranks', nranks, 'rank', rank_id) + 
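# _p2p_helper below combines the three primitives above: the sender ships only
# its mp slice (send_partial with num=mp_degree, id=mp_rank), the receiver
# lands that slice with recv_partial and then rebuilds the full tensor with
# allgather_partial over the model-parallel group. A framework-free sketch of
# that slice/rebuild round trip (split_slice and rebuild are illustrative
# names, not Paddle API):
def split_slice(flat_tensor, nranks, rank_id):
    # what the partial send transfers: this rank's contiguous 1/nranks chunk
    chunk = len(flat_tensor) // nranks
    return flat_tensor[rank_id * chunk:(rank_id + 1) * chunk]

def rebuild(chunks):
    # what the partial allgather reconstructs: the chunks in rank order
    full = []
    for chunk in chunks:
        full.extend(chunk)
    return full
# e.g. rebuild([split_slice(list(range(8)), 2, r) for r in range(2)]) == list(range(8))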
+ +def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): + global _hcg + + tensor_recv_prev = None + tensor_recv_next = None + + # send / recv message + recv_shape_msg = _send_recv_meta.recv_shape_message + recv_dtype_msg = _send_recv_meta.recv_dtype_message + send_shape_msg = _send_recv_meta.send_shape_message + send_dtype_msg = _send_recv_meta.send_dtype_message + + # model parallel message + mp_group = _hcg.get_model_parallel_group() + mp_degree = _hcg.get_model_parallel_world_size() + mp_rank = _hcg.get_model_parallel_rank() + + if recv_prev: + if isinstance(recv_shape_msg, tuple): + tensor_recv_prev = [] + for idx, shape in enumerate(recv_shape_msg): + tensor_recv_prev.append( + paddle.empty( + shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx]))) + tensor_recv_prev = tuple(tensor_recv_prev) + else: + tensor_recv_prev = paddle.empty( + shape=recv_shape_msg, dtype=number_2_dtype(recv_dtype_msg)) + + if recv_next: + if isinstance(send_shape_msg, tuple): + tensor_recv_next = [] + for idx, shape in enumerate(send_shape_msg): + tensor_recv_next.append( + paddle.empty( + shape=shape, dtype=number_2_dtype(send_dtype_msg[idx]))) + tensor_recv_next = tuple(tensor_recv_next) + else: + tensor_recv_next = paddle.empty( + shape=send_shape_msg, dtype=number_2_dtype(send_dtype_msg)) + + # start to p2p communicate + if tensor_send_prev is not None: + if isinstance(tensor_send_prev, tuple): + for d in tensor_send_prev: + paddle.distributed.wait(d, use_calc_stream=True) + send_partial( + d, + dst=0, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.send_prev_group, + use_calc_stream=False) + else: + paddle.distributed.wait(tensor_send_prev, use_calc_stream=True) + send_partial( + tensor_send_prev, + dst=0, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.send_prev_group, + use_calc_stream=False) + + if tensor_recv_prev is not None: + if isinstance(tensor_recv_prev, tuple): + for d in tensor_recv_prev: + recv_partial( + d, + src=0, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_prev_group, + use_calc_stream=True) + allgather_partial( + d, + nranks=mp_degree, + rank_id=mp_rank, + group=mp_group, + use_calc_stream=True) + else: + recv_partial( + tensor_recv_prev, + src=0, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_prev_group, + use_calc_stream=True) + allgather_partial( + tensor_recv_prev, + nranks=mp_degree, + rank_id=mp_rank, + group=mp_group, + use_calc_stream=True) + + if tensor_send_next is not None: + if isinstance(tensor_send_next, tuple): + for d in tensor_send_next: + paddle.distributed.wait(d, use_calc_stream=True) + send_partial( + d, + dst=1, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.send_next_group, + use_calc_stream=False) + else: + paddle.distributed.wait(tensor_send_next, use_calc_stream=True) + send_partial( + tensor_send_next, + dst=1, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.send_next_group, + use_calc_stream=False) + + if tensor_recv_next is not None: + if isinstance(tensor_recv_next, tuple): + for d in tensor_recv_next: + recv_partial( + d, + src=1, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_next_group, + use_calc_stream=True) + allgather_partial( + d, + nranks=mp_degree, + rank_id=mp_rank, + group=mp_group, + use_calc_stream=True) + + else: + recv_partial( + tensor_recv_next, + src=1, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_next_group, + use_calc_stream=True) + + allgather_partial( + tensor_recv_next, + nranks=mp_degree, + rank_id=mp_rank, + group=mp_group, + use_calc_stream=True) 
+ return tensor_recv_prev, tensor_recv_next + + +def recv_forward(): + if _hcg.is_first_stage: + input_tensor = None + else: + if not _send_recv_meta.has_recv_meta: + _send_recv_meta.recv_meta(_hcg.recv_prev_group) + _send_recv_meta.has_recv_meta = True + + input_tensor, _ = _p2p_helper( + tensor_send_next=None, + tensor_send_prev=None, + recv_prev=True, + recv_next=False) + return input_tensor + + +def recv_backward(): + if _hcg.is_last_stage: + output_tensor_grad = None + else: + _, output_tensor_grad = _p2p_helper( + tensor_send_next=None, + tensor_send_prev=None, + recv_prev=False, + recv_next=True) + return output_tensor_grad + + +def send_forward(output_tensor): + if not _hcg.is_last_stage: + if not _send_recv_meta.has_send_meta: + _send_recv_meta.set_send_message(output_tensor) + _send_recv_meta.send_meta(output_tensor, _hcg.send_next_group) + _send_recv_meta.has_send_meta = True + + _p2p_helper( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=False, + recv_next=False) + + +def send_backward(input_tensor_grad): + if not _hcg.is_first_stage: + _p2p_helper( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=False, + recv_next=False) + + +def send_forward_recv_backward(output_tensor): + if _hcg.is_last_stage: + output_tensor_grad = None + else: + _, output_tensor_grad = _p2p_helper( + tensor_send_next=output_tensor, + tensor_send_prev=None, + recv_prev=False, + recv_next=True) + return output_tensor_grad + + +def send_backward_recv_forward(input_tensor_grad): + if _hcg.is_first_stage: + input_tensor = None else: - stage_id = src_stage - group_id = _hcg.get_rank_from_stage(stage_id=stage_id) - return _groups[group_id] + input_tensor, _ = _p2p_helper( + tensor_send_next=None, + tensor_send_prev=input_tensor_grad, + recv_prev=True, + recv_next=False) + return input_tensor diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 5e644fefa3ffb..2247d49483035 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3232,6 +3232,22 @@ def _clone_variable(self, var, force_persistable=True): return ret_var +def _apply_pass(main_program, + startup_program, + pass_name, + pass_attrs={}, + pass_attr_types={}): + assert isinstance(pass_attrs, dict), "pass_attrs must be dict" + assert isinstance(pass_attr_types, dict), "pass_attr_types must be dict" + tmp_main_program = core.ProgramDesc(main_program.desc) + tmp_startup_program = core.ProgramDesc(startup_program.desc) + attrs = core.apply_pass(tmp_main_program, tmp_startup_program, pass_name, + pass_attrs, pass_attr_types) + main_program._rebuild_from_desc(tmp_main_program) + startup_program._rebuild_from_desc(tmp_startup_program) + return attrs + + class IrNode(object): """ Python IrNode. Beneath it is a core.Node, which is used for Ir Pass. @@ -4148,6 +4164,91 @@ def __init__(self): # compiled program, i.e. 
Graph self._graph = None + def _find_var_class_kwargs(self, new_desc): + old_desc = self.desc + all_new_vars = [] + block_num = new_desc.num_blocks() + for idx in range(block_num): + new_block_desc = new_desc.block(idx) + all_new_vars.append([]) + block_new_vars = all_new_vars[-1] + for new_var_desc in new_block_desc.all_vars(): + if self.blocks[idx].has_var(new_var_desc.name()): + old_var = self.blocks[idx].var(new_var_desc.name()) + else: + old_var = None + + kwargs = { + 'type': new_var_desc.type(), + 'name': new_var_desc.name(), + 'shape': new_var_desc.shape(), + 'dtype': new_var_desc.dtype(), + 'lod_level': new_var_desc.lod_level(), + 'error_clip': old_var.error_clip + if old_var is not None else None, + 'stop_gradient': old_var.stop_gradient + if old_var is not None else False, + 'is_data': old_var.is_data + if old_var is not None else False, + 'need_check_feed': new_var_desc.need_check_feed(), + 'belong_to_optimizer': old_var.belong_to_optimizer + if old_var is not None else False, + } + + if isinstance(old_var, Parameter): + kwargs.update({ + 'trainable': old_var.trainable, + 'optimize_attr': old_var.optimize_attr, + 'regularizer': old_var.regularizer, + 'do_model_average': old_var.do_model_average, + 'need_clip': old_var.need_clip, + 'is_distributed': old_var.is_distributed, + 'is_parameter': old_var.is_parameter, + }) + block_new_vars.append({ + 'class': Parameter, + 'kwargs': copy.deepcopy(kwargs), + }) + else: + kwargs['persistable'] = new_var_desc.persistable() + block_new_vars.append({ + 'class': Variable, + 'kwargs': copy.deepcopy(kwargs), + }) + + return all_new_vars + + def _rebuild_from_desc(self, desc): + all_new_vars = self._find_var_class_kwargs(desc) + block_num = desc.num_blocks() + assert block_num == len(all_new_vars) + + # clear old blocks and desc + self.blocks = [] + self.desc = None + + # create new blocks and set desc + self.desc = desc + self.blocks = [Block(self, idx) for idx in range(block_num)] + + # add new vars first + for idx in range(block_num): + block = self.blocks[idx] + for new_var in all_new_vars[idx]: + clazz = new_var['class'] + kwargs = new_var['kwargs'] + kwargs['block'] = block + clazz(**kwargs) + + # then append op + for idx in range(block_num): + block = self.blocks[idx] + block_desc = self.desc.block(idx) + for op_idx in range(block_desc.op_size()): + op_desc = block_desc.op(op_idx) + op = Operator(block=block, desc=op_desc) + block.ops.append(op) + def global_seed(self, seed=0): """ Set global seed for Program diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py index 7b4a35a6a7898..f486cbc27dca5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py @@ -18,6 +18,7 @@ import numpy as np import paddle +from paddle.static import InputSpec SEED = 2020 np.random.seed(SEED) @@ -176,6 +177,46 @@ def test_set_value_with_save(self): output_spec=None) +class TestSliceSupplementSpecialCase(unittest.TestCase): + # unittest for slice index which abs(step)>0. 
eg: x[::2] + def test_static_slice_step(self): + paddle.enable_static() + array = np.arange(4**3).reshape((4, 4, 4)).astype('int64') + + x = paddle.static.data(name='x', shape=[4, 4, 4], dtype='int64') + z1 = x[::2] + z2 = x[::-2] + + place = paddle.CPUPlace() + prog = paddle.static.default_main_program() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + out = exe.run(prog, feed={'x': array}, fetch_list=[z1, z2]) + + self.assertTrue(np.array_equal(out[0], array[::2])) + self.assertTrue(np.array_equal(out[1], array[::-2])) + + def test_static_slice_step_dygraph2static(self): + paddle.disable_static() + + array = np.arange(4**2 * 5).reshape((5, 4, 4)).astype('int64') + inps = paddle.to_tensor(array) + + def func(inps): + return inps[::2], inps[::-2] + + origin_result = func(inps) + sfunc = paddle.jit.to_static( + func, input_spec=[InputSpec(shape=[None, 4, 4])]) + static_result = sfunc(inps) + + self.assertTrue( + np.array_equal(origin_result[0].numpy(), static_result[0].numpy())) + self.assertTrue( + np.array_equal(origin_result[1].numpy(), static_result[1].numpy())) + + class TestPaddleStridedSlice(unittest.TestCase): def test_compare_paddle_strided_slice_with_numpy(self): paddle.disable_static() @@ -202,6 +243,20 @@ def test_compare_paddle_strided_slice_with_numpy(self): np.array_equal(sl.numpy(), array[s2[0]:e2[0]:stride2[0], s2[1]:e2[ 1]:stride2[1]])) + array = np.arange(6 * 7 * 8).reshape((6, 7, 8)) + pt = paddle.to_tensor(array) + s2 = [7, -1] + e2 = [2, -5] + stride2 = [-2, -3] + sl = paddle.strided_slice( + pt, axes=[0, 2], starts=s2, ends=e2, strides=stride2) + + array_slice = array[s2[0]:e2[0]:stride2[0], ::, s2[1]:e2[1]:stride2[1]] + self.assertTrue( + np.array_equal(sl.numpy(), array_slice), + msg="paddle.strided_slice:\n {} \n numpy slice:\n{}".format( + sl.numpy(), array_slice)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py new file mode 100644 index 0000000000000..b336330836a66 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py @@ -0,0 +1,180 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
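(Aside, not part of the patch.) The slice tests added above all hinge on negative-step semantics matching Python/numpy slicing. A minimal standalone check with paddle.strided_slice, the same API the test exercises; the shapes and index values here are illustrative only:

import numpy as np
import paddle

paddle.disable_static()
arr = np.arange(12).reshape(4, 3).astype('int64')
t = paddle.to_tensor(arr)

# start=3, stop=0 (exclusive), step=-2 along axis 0 -> rows 3 and 1.
out = paddle.strided_slice(t, axes=[0], starts=[3], ends=[0], strides=[-2])
assert np.array_equal(out.numpy(), arr[3:0:-2])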
+ +from __future__ import division +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from paddle.fluid import layers +import paddle.nn.functional as F +from paddle.distributed.fleet.meta_parallel import PipelineLayer, LayerDesc +from paddle.fluid.dygraph.layers import Layer +import paddle.nn as nn + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + dp_id) + + +batch_size = 8 +length = 8 +micro_batch_size = 2 +vocab_size = 128 +hidden_size = 16 +d_model = hidden_size +dim_feedforward = 4 * d_model + + +class EmbeddingNet(Layer): + def __init__(self): + super(EmbeddingNet, self).__init__() + self.word_embeddings = nn.Embedding(vocab_size, hidden_size) + self.position_embeddings = nn.Embedding(vocab_size, hidden_size) + + def forward(self, x): + attention_mask = paddle.tensor.triu( + (paddle.ones( + (length, length), dtype="float32") * -1e9), 1) + attention_mask.stop_gradient = True + w_emb = self.word_embeddings(x) + p_emb = self.position_embeddings(x) + w_emb = w_emb + p_emb + + # need to fix bug of backward() + return w_emb, attention_mask + + +class TransformerNet(Layer): + def __init__(self): + super(TransformerNet, self).__init__() + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.q_proj = nn.Linear(d_model, d_model) + self.k_proj = nn.Linear(d_model, d_model) + self.v_proj = nn.Linear(d_model, d_model) + + self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, x, mask): + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + product = layers.matmul(x=q, y=k, transpose_y=True, alpha=d_model**-0.5) + + weights = F.softmax(product + mask) + weights = F.dropout(weights, 0.2) + tgt = layers.matmul(weights, v) + residual = tgt + tgt = self.norm1(tgt) + tgt = residual + tgt + + out = self.linear2(F.gelu(self.linear1(tgt), approximate=True)) + return out + + +class EmbeddingPipe(EmbeddingNet): + def forward(self, x): + return super().forward(x) + + +class TransformerNetPipe(TransformerNet): + def forward(self, args): + x, mask = args[0], args[1] + + output = super().forward(x, mask) + output = output + mask.stop_gradient = True + return output, mask + + +class CriterionPipe(Layer): + def __init__(self): + super(CriterionPipe, self).__init__() + + def forward(self, out, label): + loss = out.mean() + return loss + + +class ModelPipe(PipelineLayer): + def __init__(self, topology): + self.descs = [] + self.descs.append(LayerDesc(EmbeddingPipe)) + + for x in range(6): + self.descs.append(LayerDesc(TransformerNetPipe)) + + self.descs.append(lambda x: x[0]) + + super().__init__( + layers=self.descs, + loss_fn=CriterionPipe(), + topology=topology, + seg_method="layer:TransformerNetPipe") + + +class TestDistPPTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = { + "accumulate_steps": batch_size // micro_batch_size, + "micro_batch_size": micro_batch_size + } + fleet.init(is_collective=True, strategy=strategy) + + def test_pp_model(self): + hcg = 
fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + topology = hcg.topology() + set_random_seed(1024, dp_id, rank_id) + + model = ModelPipe(topology) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + parameters=model.parameters()) + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + for step_id in range(5): + x_data = np.random.randint(0, vocab_size, size=[batch_size, length]) + x = paddle.to_tensor(x_data) + x.stop_gradient = True + loss = model.train_batch([x, x], optimizer, scheduler) + # TODO(shenliang03) add utest for loss + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py new file mode 100644 index 0000000000000..d51976e1a1962 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_assign_value_op_npu.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
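(Aside, not part of the patch.) One detail of the transformer test above worth spelling out: with batch_size = 8 and micro_batch_size = 2, each model.train_batch(...) call consumes one global batch as four pipelined micro-batches. Back-of-the-envelope, using the values defined in the test:

batch_size = 8
micro_batch_size = 2

# "accumulate_steps" in strategy.pipeline_configs is the number of
# micro-batches the 1F1B schedule runs per train_batch() call.
accumulate_steps = batch_size // micro_batch_size
assert accumulate_steps == 4
# Each stage therefore runs 4 forward and 4 backward passes, each on an
# input of shape [micro_batch_size, length] = [2, 8], per optimizer step.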
+ +from __future__ import print_function + +import unittest +import numpy +import sys +sys.path.append("..") + +import op_test +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers + +paddle.enable_static() +numpy.random.seed(2021) + + +class TestAssignValueNPUOp(op_test.OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + + self.op_type = "assign_value" + self.inputs = {} + self.attrs = {} + self.init_data() + + self.attrs["shape"] = self.value.shape + self.attrs["dtype"] = framework.convert_np_dtype_to_dtype_( + self.value.dtype) + self.outputs = {"Out": self.value} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.float32) + self.attrs["fp32_values"] = [float(v) for v in self.value.flat] + + def test_forward(self): + self.check_output_with_place(self.place) + + +class TestAssignValueNPUOp2(TestAssignValueNPUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int32) + self.attrs["int32_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueNPUOp3(TestAssignValueNPUOp): + def init_data(self): + self.value = numpy.random.random(size=(2, 5)).astype(numpy.int64) + self.attrs["int64_values"] = [int(v) for v in self.value.flat] + + +class TestAssignValueNPUOp4(TestAssignValueNPUOp): + def init_data(self): + self.value = numpy.random.choice( + a=[False, True], size=(2, 5)).astype(numpy.bool) + self.attrs["bool_values"] = [bool(v) for v in self.value.flat] + + +class TestAssignApi(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.value = ( + -100 + 200 * numpy.random.random(size=(2, 5))).astype(self.dtype) + self.place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu( + ) else fluid.CPUPlace() + + def init_dtype(self): + self.dtype = "float32" + + def test_assign(self): + main_program = fluid.Program() + with fluid.program_guard(main_program): + x = layers.create_tensor(dtype=self.dtype) + layers.assign(input=self.value, output=x) + + exe = fluid.Executor(self.place) + [fetched_x] = exe.run(main_program, feed={}, fetch_list=[x]) + self.assertTrue( + numpy.array_equal(fetched_x, self.value), + "fetch_x=%s val=%s" % (fetched_x, self.value)) + self.assertEqual(fetched_x.dtype, self.value.dtype) + + +class TestAssignApi2(TestAssignApi): + def init_dtype(self): + self.dtype = "int32" + + +class TestAssignApi3(TestAssignApi): + def init_dtype(self): + self.dtype = "int64" + + +class TestAssignApi4(TestAssignApi): + def setUp(self): + self.init_dtype() + self.value = numpy.random.choice( + a=[False, True], size=(2, 5)).astype(numpy.bool) + self.place = fluid.NPUPlace(0) if fluid.core.is_compiled_with_npu( + ) else fluid.CPUPlace() + + def init_dtype(self): + self.dtype = "bool" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_crop_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_crop_op_npu.py new file mode 100755 index 0000000000000..02168aeb71d3e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_crop_op_npu.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from test_crop_op import crop + +paddle.enable_static() +np.random.seed(10) + + +class TestCropOp(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "crop" + self.attrs = {} + self.offset_by_input = False + self.crop_by_input = False + self.dtype = np.float32 + self.initTestCase() + if self.crop_by_input: + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': np.random.random(self.crop_shape).astype(self.dtype) + } + else: + self.attrs['shape'] = self.crop_shape + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + } + + if self.offset_by_input: + self.inputs['Offsets'] = np.array(self.offsets).astype('int32') + else: + self.attrs['offsets'] = self.offsets + + if len(self.offsets) == 0: + self.offsets = np.zeros_like(self.crop_shape) + + self.outputs = { + 'Out': crop(self.inputs['X'], self.offsets, self.crop_shape) + } + + def set_npu(self): + self.__class__.use_npu = True + + def initTestCase(self): + self.x_shape = (10, 10) + self.crop_shape = [2, 2] + self.offsets = [1, 2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestCase1(TestCropOp): + def initTestCase(self): + self.x_shape = (16, 8, 32) + self.crop_shape = [2, 2, 3] + self.offsets = [1, 5, 3] + + +class TestCase2(TestCropOp): + def initTestCase(self): + self.x_shape = (15, 8) + self.crop_shape = [15, 8] + self.offsets = [0, 0] + + +class TestCase3(TestCropOp): + def initTestCase(self): + self.x_shape = (4, 10) + self.crop_shape = [2, 3] + self.offsets = [0, 2] + self.offset_by_input = True + + +class TestCase4(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [] + + +class TestCase5(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.offset_by_input = True + + +class TestCase6(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.offset_by_input = True + self.__class__.no_need_check_grad = True + self.dtype = np.float16 + + +class TestCase7(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.offset_by_input = True + self.dtype = np.int32 + + +class TestCase8(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [] + self.offset_by_input = True + + +class TestCase9(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.crop_by_input = True + + +class TestCase10(TestCropOp): + def initTestCase(self): + self.x_shape = (10, 9, 14) + self.crop_shape = [3, 3, 5] + self.offsets = [3, 5, 4] + self.crop_by_input = True + self.offset_by_input = 
True + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py old mode 100644 new mode 100755 index cb58a2a8d4409..b6e3134439d03 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -170,6 +170,44 @@ def test_check_grad_ingore_y(self): pass +class TestMul4(TestMul): + # case 4: (20, 2, 2, 3) * (12, 50) -> (20, 50), x_num_col_dims = 1 + def config(self): + self.x_shape = (20, 2, 2, 3) + self.y_shape = (12, 50) + + def setUp(self): + self.set_npu() + self.op_type = "mul" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': np.random.random(self.y_shape).astype(self.dtype) + } + self.outputs = { + 'Out': np.dot(self.inputs['X'].reshape(20, 12), self.inputs['Y']) + } + + +@skip_check_grad_ci( + reason="Don't support grad checking for NPU OP with FP16 data type.") +class TestMul4FP16(TestMul4): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_grad_normal(self): + pass + + def test_check_grad_ingore_x(self): + pass + + def test_check_grad_ingore_y(self): + pass + + class TestMulNet(unittest.TestCase): def init_dtype(self): self.dtype = np.float32 @@ -385,5 +423,80 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +class TestMulNet4_2(unittest.TestCase): + def init_dtype(self): + self.dtype = np.float32 + + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(12, 5)).astype(self.dtype) + b_np = np.random.random(size=(12, 5)).astype(self.dtype) + c_np = np.random.random(size=(12, 5)).astype(self.dtype) + d_np = np.random.random(size=(12, 5)).astype(self.dtype) + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[12, 5], dtype=self.dtype) + b = paddle.static.data(name="b", shape=[12, 5], dtype=self.dtype) + c = paddle.static.data(name="c", shape=[12, 5], dtype=self.dtype) + d = paddle.static.data(name="d", shape=[12, 5], dtype=self.dtype) + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) # [12, 5] + sum_2 = paddle.add(c, d) # [12, 5] + fc_1 = fluid.layers.fc(input=sum_1, size=2) # [12, 2] + fc_1_re_shape = paddle.reshape(fc_1, shape=[2, 3, 2, 2]) + fc_2 = fluid.layers.fc(input=sum_2, size=2) # [12, 2] + result = paddle.fluid.layers.mul(fc_1_re_shape, + fc_2) # [2, 3, 2, 2] * [12, 2] + + prediction = fluid.layers.fc(input=result, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("testMulNet4_2 tart run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, 
pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + self.init_dtype() + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose( + npu_pred, cpu_pred, atol=1e-5)) # atol needed on cann 20.3 + self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-5)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py new file mode 100644 index 0000000000000..e8f5de005d421 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_reciprocal_op_npu.py @@ -0,0 +1,78 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, skip_check_grad_ci +import paddle +paddle.enable_static() + + +class TestNPUReciprocal(OpTest): + def setUp(self): + self.op_type = "reciprocal" + self.set_npu() + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.reciprocal(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.01) + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNPUReciprocalFp64(TestNPUReciprocal): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float64 + + +@skip_check_grad_ci( + reason="The backward test is not supported for float16 type on NPU.") +class TestNPUReciprocalFp16(TestNPUReciprocal): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py new file mode 100644 index 0000000000000..b35fc9bae651a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.vision.models import resnet50 +from paddle.nn import CrossEntropyLoss +from paddle.fluid.framework import _apply_pass +import unittest + + +class TestApplyPassToProgram(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def global_block_contains_op(self, program, op_type): + for op in program.global_block().ops: + if op.type == op_type: + return True + return False + + def test_case(self): + image = paddle.static.data( + name="image", shape=[None, 3, 224, 224], dtype="float32") + label = paddle.static.data(name="label", shape=[None, 1], dtype="int64") + model = resnet50() + loss_fn = CrossEntropyLoss() + pred = model(image) + loss = loss_fn(pred, label) + optimizer = paddle.optimizer.SGD(learning_rate=1e-3) + optimizer.minimize(loss) + + startup = paddle.static.default_startup_program() + main = paddle.static.default_main_program() + + fused_op = "fused_elemwise_add_activation" + self.assertFalse(self.global_block_contains_op(main, fused_op)) + attrs = { + "int_attr": -3, + "size_t_attr": 10, + "float_attr": 3.25, + "float32_attr": -4.5, + "str_attr": "any string attr value", + } + attr_types = { + "size_t_attr": "size_t", + "float32_attr": "float32", + } + ret_attrs = _apply_pass(main, startup, "fuse_elewise_add_act_pass", + attrs, attr_types) + self.assertEqual(attrs, ret_attrs) + self.assertTrue(self.global_block_contains_op(main, fused_op)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py index 623b7e68b3f7f..bc280a01890d4 100644 --- a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -115,6 +115,31 @@ def test_backward_none_grad_tensor(self): self.assertTrue(np.allclose(x_grad, x_tensor.grad.numpy())) + def test_backward_accumulator_with_init_grad(self): + for dtype in self._dtypes: + x = np.random.random([10, ]).astype(dtype) + y_grad = np.random.random([10, ]).astype(dtype) + z_grad = np.random.random([10, ]).astype(dtype) + self._places = [paddle.CPUPlace()] + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = x_tensor**2 + z_tensor = y_tensor**3 + + y_grad_tensor = paddle.to_tensor(y_grad) + z_grad_tensor = paddle.to_tensor(z_grad) + paddle.autograd.backward([y_tensor, z_tensor], + [y_grad_tensor, z_grad_tensor]) + + y = x**2 + z = x**3 + x_grad = 2 * x_tensor * ( + y_grad_tensor + 3 * y_tensor * y_tensor * z_grad_tensor) + + self.assertTrue( + np.allclose(x_grad.numpy(), x_tensor.grad.numpy())) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index a1cd0df8d7c7e..1387827736560 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -34,7 +34,7 @@ def 
test_sharding_optimizer(self): self.set_strategy(strategy, 'sharding') self.optimizer(avg_cost, strategy, train_prog, startup_prog) parameters = [ - x.name for x in train_prog.list_vars() if x.persistable == True + x.name for x in train_prog.list_vars() if x.persistable is True ] ops = [op.type for op in avg_cost.block.ops] vars = [x.name for x in train_prog.list_vars()] @@ -292,7 +292,7 @@ def test_sharding_clone_for_test(self): ]) -class TestFleetMetaOptimizer_V1(TestFleetMetaOptimizer): +class TestFleetShardingHybridOptimizer(TestFleetMetaOptimizer): def setUp(self): os.environ["PADDLE_TRAINER_ID"] = "3" os.environ[ @@ -303,7 +303,7 @@ def setUp(self): self.sharding_ring_id = 1 self.dp_ring_id = 2 self.global_ring_id = 3 - self.pp_ring_id = 20 + self.pp_pair_ring_id = 20 def test_sharding_with_mp(self): # NOTE(JZ-LIANG) MP parallelism need user to build model with MP API @@ -336,7 +336,7 @@ def test_sharding_with_mp(self): sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_1": + 0] == "comm_id_0": sharding_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) @@ -345,7 +345,7 @@ def test_sharding_with_mp(self): sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_2": + 0] == "comm_id_1": dp_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) @@ -381,7 +381,7 @@ def test_sharding_hybrid_dp(self): sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_1": + 0] == "comm_id_0": sharding_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) @@ -390,7 +390,7 @@ def test_sharding_hybrid_dp(self): sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_2": + 0] == "comm_id_1": dp_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) @@ -450,7 +450,7 @@ def test_sharding_hybrid_dp_gm(self): sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_1": + 0] == "comm_id_0": sharding_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) @@ -459,7 +459,7 @@ def test_sharding_hybrid_dp_gm(self): sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_2": + 0] == "comm_id_1": dp_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) @@ -530,12 +530,8 @@ def test_sharding_with_pp(self): 'fill_constant', 'uniform_random', 'fill_constant', 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', - 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 'c_allreduce_sum', - 'c_sync_calc_stream', 'c_gen_nccl_id', 'c_comm_init', - 'fill_constant', 'c_allreduce_sum', 'c_sync_calc_stream', - 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 'c_allreduce_sum', - 'c_sync_calc_stream', 'c_gen_nccl_id', 'c_comm_init', - 'fill_constant', 
'c_allreduce_sum', 'c_sync_calc_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init' ]) self.assertEqual(main_prog_op_types, [ @@ -566,13 +562,13 @@ def test_sharding_with_pp(self): if op.type == "c_comm_init" ] self.assertIn(self.sharding_ring_id, created_ring_ids) - self.assertIn(self.pp_ring_id, created_ring_ids) + self.assertIn(self.pp_pair_ring_id, created_ring_ids) # check correctness of pp group sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_1": + 0] == "comm_id_0": sharding_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) @@ -581,7 +577,7 @@ def test_sharding_with_pp(self): sharding_group_waiting_port = None for op in startup_prog_ops: if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ - 0] == "nccl_id_2": + 0] == "comm_id_1": dp_group_waiting_ports = op.desc.attr("other_endpoints") self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) @@ -616,6 +612,86 @@ def test_sharding_dp_with_allreduce_fuse(self): if op.type == 'c_allreduce_sum': assert 'FusedOutput' in op.input_arg_names[0] + def test_hybrid_with_mp_pp_amp_gclip(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + self.set_strategy(strategy, 'amp') + strategy.sharding = True + strategy.sharding_configs = { + "sharding_degree": 1, + "mp_degree": 2, + "pp_degree": 2, + "dp_degree": 1, + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + self.optimizer( + avg_cost, strategy, train_prog, startup_prog, grad_clip=clip) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # ring: mp, pp_group, pp_pair, pp_pair + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', + 'c_gen_nccl_id', 'c_comm_init' + ]) + + # pp + mp, partial send recv + self.assertIn('partial_recv', main_prog_op_types) + self.assertIn('partial_allgather', main_prog_op_types) + self.assertIn('partial_send', main_prog_op_types) + + # amp check_finite_and_unscale, allreduce(mp)->allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_max'), 2) + + # global gradient clip, allreduce(mp)->allreduce(pp) + self.assertEqual(main_prog_op_types.count('c_allreduce_sum'), 2) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.mp_ring_id, created_ring_ids) + self.assertIn(self.pp_pair_ring_id, 
created_ring_ids) + + # check correctness of pp group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_0": + mp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(mp_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "comm_id_1": + pp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(pp_group_waiting_ports, ['127.0.0.1:36002']) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 9f534381c98ab..62e781678c9fc 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -33,6 +33,9 @@ def test_hybrid_parallel_shared_weight(self): def test_pipeline_parallel(self): self.run_mnist_2gpu('hybrid_parallel_pp_amp.py') + def test_hybrid_parallel_transformer(self): + self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index ebf7c01e2cae5..3c1a2649a7dfa 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -588,5 +588,331 @@ def test_cuda_pinned_place(self): self.assertFalse(y.place.is_cuda_pinned_place()) +class ArrayLayer(paddle.nn.Layer): + def __init__(self, input_size=224, output_size=10, array_size=1): + super(ArrayLayer, self).__init__() + self.input_size = input_size + self.output_size = output_size + self.array_size = array_size + for i in range(self.array_size): + setattr(self, + self.create_name(i), + paddle.nn.Linear(input_size, output_size)) + + def create_name(self, index): + return 'linear_' + str(index) + + def forward(self, inps): + array = [] + for i in range(self.array_size): + linear = getattr(self, self.create_name(i)) + array.append(linear(inps)) + + tensor_array = self.create_tensor_array(array) + + tensor_array = self.array_slice(tensor_array) + + array1 = paddle.concat(tensor_array) + array2 = paddle.concat(tensor_array[::-1]) + return array1 + array2 * array2 + + def get_all_grads(self, param_name='weight'): + grads = [] + for i in range(self.array_size): + linear = getattr(self, self.create_name(i)) + param = getattr(linear, param_name) + + g = param.grad + if g is not None: + g = g.numpy() + + grads.append(g) + + return grads + + def clear_all_grad(self): + param_names = ['weight', 'bias'] + for i in range(self.array_size): + linear = getattr(self, self.create_name(i)) + for p in param_names: + param = getattr(linear, p) + param.clear_gradient() + + def array_slice(self, array): + return array + + def create_tensor_array(self, tensors): + tensor_array = None + for i, tensor in enumerate(tensors): + index = paddle.full(shape=[1], dtype='int64', fill_value=i) + if tensor_array is None: + tensor_array = paddle.tensor.array_write(tensor, i=index) + else: + paddle.tensor.array_write(tensor, i=index, array=tensor_array) + return tensor_array + + +class TestStridedSliceTensorArray(unittest.TestCase): + def 
setUp(self): + paddle.disable_static() + + def grad_equal(self, g1, g2): + if g1 is None: + g1 = np.zeros_like(g2) + if g2 is None: + g2 = np.zeros_like(g1) + return np.array_equal(g1, g2) + + def is_grads_equal(self, g1, g2): + for i, g in enumerate(g1): + + self.assertTrue( + self.grad_equal(g, g2[i]), + msg="gradient_1:\n{} \ngradient_2:\n{}".format(g, g2)) + + def is_grads_equal_zeros(self, grads): + for g in grads: + self.assertTrue( + self.grad_equal(np.zeros_like(g), g), + msg="The gradient should be zeros, but received \n{}".format(g)) + + def create_case(self, net): + inps1 = paddle.randn([1, net.input_size], dtype='float32') + inps2 = inps1.detach().clone() + l1 = net(inps1) + s1 = l1.numpy() + l1.sum().backward() + grads_dy = net.get_all_grads() + net.clear_all_grad() + grads_zeros = net.get_all_grads() + + self.is_grads_equal_zeros(grads_zeros) + + func = paddle.jit.to_static(net.forward) + l2 = func(inps2) + s2 = l2.numpy() + l2.sum().backward() + grads_static = net.get_all_grads() + net.clear_all_grad() + # compare result of dygraph and static + self.is_grads_equal(grads_static, grads_dy) + self.assertTrue( + np.array_equal(s1, s2), + msg="dygraph graph result:\n{} \nstatic dygraph result:\n{}".format( + l1.numpy(), l2.numpy())) + + def test_strided_slice_tensor_array_cuda_pinned_place(self): + if paddle.device.is_compiled_with_cuda(): + with paddle.fluid.dygraph.guard(): + + class Simple(paddle.nn.Layer): + def __init__(self): + super(Simple, self).__init__() + + def forward(self, inps): + tensor_array = None + for i, tensor in enumerate(inps): + index = paddle.full( + shape=[1], dtype='int64', fill_value=i) + if tensor_array is None: + tensor_array = paddle.tensor.array_write( + tensor, i=index) + else: + paddle.tensor.array_write( + tensor, i=index, array=tensor_array) + + array1 = paddle.concat(tensor_array) + array2 = paddle.concat(tensor_array[::-1]) + return array1 + array2 * array2 + + net = Simple() + func = paddle.jit.to_static(net.forward) + + inps1 = paddle.to_tensor( + np.random.randn(2, 10), + place=paddle.CUDAPinnedPlace(), + stop_gradient=False) + inps2 = paddle.to_tensor( + np.random.randn(2, 10), + place=paddle.CUDAPinnedPlace(), + stop_gradient=False) + + self.assertTrue(inps1.place.is_cuda_pinned_place()) + self.assertTrue(inps2.place.is_cuda_pinned_place()) + + result = func([inps1, inps2]) + + self.assertFalse(result.place.is_cuda_pinned_place()) + + def test_strided_slice_tensor_array(self): + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[::-1] + + self.create_case(Net(array_size=10)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[::-2] + + self.create_case(Net(input_size=112, array_size=11)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[::-3] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[1::-4] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[:7:-4] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[8:0:-4] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[8:1:-4] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[::2] + + 
self.create_case(Net(input_size=112, array_size=11)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[::3] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[1::4] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[:8:4] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[1:8:4] + + self.create_case(Net(input_size=112, array_size=9)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[8:10:4] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[3:10:4] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[2:10:4] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[3:10:3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[3:15:3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[0:15:3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-1:-5:-3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-1:-6:-3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-3:-6:-3] + + self.create_case(Net(input_size=112, array_size=13)) + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-5:-1:3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-6:-1:3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-6:-3:3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[0::3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-60:20:3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-3:-60:-3] + + self.create_case(Net(input_size=112, array_size=13)) + + class Net(ArrayLayer): + def array_slice(self, tensors): + return tensors[-1:-60:-3] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 9c94e3c9ab300..4b52cfceabf85 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -72,10 +72,17 @@ def _test_place(place): if core.is_compiled_with_cuda(): y = x.pin_memory() self.assertEqual(y.place.__repr__(), "CUDAPinnedPlace") + y = x.cuda() + y = x.cuda(None) + self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + y = x.cuda(device_id=0) + self.assertEqual(y.place.__repr__(), 
"CUDAPlace(0)") y = x.cuda(blocking=False) self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") y = x.cuda(blocking=True) self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + with self.assertRaises(ValueError): + y = x.cuda("test") # support 'dtype' is core.VarType x = paddle.rand((2, 2)) diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index 2c2a6412497a5..1ba44cea76347 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -144,13 +144,10 @@ def _getitem_impl_(var, item): step = 1 if step is None else step - if start is None and end is None: - assert (step == -1) - reverse_axes.append(dim) - continue - - start = 0 if start is None else start - end = MAX_INTEGER if end is None else end + if start is None: + start = 0 if step > 0 else MAX_INTEGER + if end is None: + end = MAX_INTEGER if step > 0 else -1 elif isinstance(slice_item, list): is_bool_list = False diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index dd088b18ca27d..32050c12ec147 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -31,11 +31,11 @@ class Adadelta(Optimizer): .. math:: - E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 + E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2 - learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \\epsilon ) / ( E(g_t^2) + \\epsilon ) } + learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) } - E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\_rate)^2 + E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2 Args: learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``. diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index 6238d32e9c49d..7ca4ab648a1f5 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -32,7 +32,7 @@ class Adagrad(Optimizer): moment\_out &= moment + grad * grad - param\_out &= param - \\frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} + param\_out &= param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} The original paper does not have the ``epsilon`` attribute. 
It is added here diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index e59deb5d61bd0..e065ee91c6840 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -42,14 +42,14 @@ class Adam(Optimizer): t & = t + 1 - moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad - moment\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + moment\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad - learning\_rate & = learning\_rate * \\ - \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {\\beta}_1^t} + learning\_rate & = learning\_rate * \ + \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} - param\_out & = param - learning\_rate * \\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + param\_out & = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} Related paper: `Adam: A Method for Stochastic Optimization `_ diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 867b7703720ba..de70e2e72a9c6 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -33,13 +33,13 @@ class Adamax(Optimizer): t & = t + 1 - moment\_out & = {\\beta}_1 * moment + (1 - {\\beta}_1) * grad + moment\_out & = {\beta}_1 * moment + (1 - {\beta}_1) * grad - inf\_norm\_out & = max({\\beta}_2 * inf\_norm + \epsilon, |grad|) + inf\_norm\_out & = max({\beta}_2 * inf\_norm + \epsilon, |grad|) - learning\_rate & = \\frac{learning\_rate}{1 - {\\beta}_1^t} + learning\_rate & = \frac{learning\_rate}{1 - {\beta}_1^t} - param\_out & = param - learning\_rate * \\frac{moment\_out}{inf\_norm\_out} + param\_out & = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} Related paper: `Adam: A Method for Stochastic Optimization `_ diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index f830a9096caa7..11ba49c0707a3 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -32,14 +32,14 @@ class AdamW(Adam): t & = t + 1 - moment\_1\_out & = {\\beta}_1 * moment\_1 + (1 - {\\beta}_1) * grad + moment\_1\_out & = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad - moemnt\_2\_out & = {\\beta}_2 * moment\_2 + (1 - {\\beta}_2) * grad * grad + moemnt\_2\_out & = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad - learning\_rate & = learning\_rate * \\ - \\frac{\sqrt{1 - {\\beta}_2^t}}{1 - {beta}_1^t} + learning\_rate & = learning\_rate * + \frac{\sqrt{1 - {\beta}_2^t}}{1 - {beta}_1^t} - param\_out & = param - learning\_rate * (\\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) + param\_out & = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) Args: diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py index 1d72c1df2b1d8..43d4d326bd7e9 100644 --- a/python/paddle/optimizer/lamb.py +++ b/python/paddle/optimizer/lamb.py @@ -34,17 +34,17 @@ class Lamb(Optimizer): .. 
math:: - m_t &= \\beta_1 m_{t - 1}+ (1 - \\beta_1)g_t + m_t &= \beta_1 m_{t - 1}+ (1 - \beta_1)g_t - v_t &= \\beta_2 v_{t - 1} + (1 - \\beta_2)g_t^2 + v_t &= \beta_2 v_{t - 1} + (1 - \beta_2)g_t^2 - m_t &= \\frac{m_t}{\\beta_1^t} + m_t &= \frac{m_t}{\beta_1^t} - v_t &= \\frac{v_t}{\\beta_2^t} + v_t &= \frac{v_t}{\beta_2^t} - r_t &= \\frac{m_t}{\\sqrt{v_t}+\\epsilon} + r_t &= \frac{m_t}{\sqrt{v_t}+\epsilon} - w_t &= w_{t-1} -\\eta_t \\frac{\\left \| w_{t-1}\\right \|}{\\left \| r_t + \\lambda w_{t-1}\\right \|} (r_t + \\lambda w_{t-1}) + w_t &= w_{t-1} -\eta_t \frac{\left \| w_{t-1}\right \|}{\left \| r_t + \lambda w_{t-1}\right \|} (r_t + \lambda w_{t-1}) where :math:`m` is the 1st moment, and :math:`v` the 2nd moment, :math:`\\eta` the @@ -76,8 +76,8 @@ class Lamb(Optimizer): .. code-block:: python import paddle - import numpy as np - inp = paddle.uniform(min=-0.1, max=0.1, shape=[10, 10], dtype='float32') + + inp = paddle.uniform(shape=[10, 10], dtype='float32', min=-0.1, max=0.1) linear = paddle.nn.Linear(10, 10) out = linear(inp) loss = paddle.mean(out) @@ -88,30 +88,6 @@ class Lamb(Optimizer): lamb.step() lamb.clear_grad() - - #Note that the learning_rate of linear_2 is 0.01. - linear_1 = paddle.nn.Linear(10, 10) - linear_2 = paddle.nn.Linear(10, 10) - inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) - out = linear_1(inp) - out = linear_2(out) - loss = paddle.mean(out) - lamb = paddle.optimizer.Lamb( - learning_rate=0.1, - parameters=[{ - 'params': linear_1.parameters() - }, { - 'params': linear_2.parameters(), - 'weight_decay': 0.001, - 'learning_rate': 0.1, - 'lamb_weight_decay': 0.02 - }], - weight_decay=0.01, - lamb_weight_decay=0.01) - out.backward() - lamb.step() - lamb.clear_grad() - """ _moment1_acc_str = "moment1" _moment2_acc_str = "moment2" diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 7cea2645fa65d..be1786696bd92 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -472,7 +472,7 @@ class InverseTimeDecay(LRScheduler): .. math:: - new\_learning\_rate = \\frac{learning\_rate}{1 + gamma * epoch} + new\_learning\_rate = \frac{learning\_rate}{1 + gamma * epoch} Args: learning_rate (float): The initial learning rate. It is a python float number. @@ -555,9 +555,9 @@ class PolynomialDecay(LRScheduler): .. math:: - decay\_steps & = decay\_steps * math.ceil(\\frac{epoch}{decay\_steps}) + decay\_steps & = decay\_steps * math.ceil(\frac{epoch}{decay\_steps}) - new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr + new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr If cycle is set to False, then: @@ -565,7 +565,7 @@ class PolynomialDecay(LRScheduler): epoch & = min(epoch, decay\_steps) - new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\\frac{epoch}{decay\_steps})^{power}+end\_lr + new\_learning\_rate & = (learning\_rate-end\_lr)*(1-\frac{epoch}{decay\_steps})^{power}+end\_lr Args: @@ -676,7 +676,7 @@ class LinearWarmup(LRScheduler): .. math:: - lr = start\_lr + (end\_lr - start\_lr) * \\frac{epoch}{warmup\_steps} + lr = start\_lr + (end\_lr - start\_lr) * \frac{epoch}{warmup\_steps} where start_lr is the initial learning rate, and end_lr is the final learning rate; @@ -1407,14 +1407,13 @@ class CosineAnnealingDecay(LRScheduler): .. 
math:: - \\begin{aligned} - \eta_t & = \eta_{min} + \\frac{1}{2}(\eta_{max} - \eta_{min})\left(1 - + \cos\left(\\frac{T_{cur}}{T_{max}}\pi\\right)\\right), - & T_{cur} \\neq (2k+1)T_{max}; \\ - \eta_{t+1} & = \eta_{t} + \\frac{1}{2}(\eta_{max} - \eta_{min}) - \left(1 - \cos\left(\\frac{1}{T_{max}}\pi\\right)\\right), - & T_{cur} = (2k+1)T_{max}. - \end{aligned} + \eta_t & = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})\left(1 + + \cos\left(\frac{T_{cur}}{T_{max}}\pi\right)\right), + & T_{cur} \neq (2k+1)T_{max}; + + \eta_{t+1} & = \eta_{t} + \frac{1}{2}(\eta_{max} - \eta_{min}) + \left(1 - \cos\left(\frac{1}{T_{max}}\pi\right)\right), + & T_{cur} = (2k+1)T_{max}. It has been proposed in `SGDR: Stochastic Gradient Descent with Warm Restarts `_. Note that this only implements the cosine annealing part of SGDR, and not the restarts. diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 14249df3f5628..6a59052fc0255 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -30,9 +30,9 @@ class RMSProp(Optimizer): .. math:: - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2 - w & = w - \\frac{\\eta} {\\sqrt{r(w,t) + \\epsilon}} \\nabla Q_{i}(w) + w & = w - \frac{\eta} {\sqrt{r(w,t) + \epsilon}} \nabla Q_{i}(w) The first equation calculates moving average of the squared gradient for each weight. Then dividing the gradient by :math:`sqrt{v(w,t)}`. @@ -42,10 +42,10 @@ class RMSProp(Optimizer): .. math:: - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2 - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) + - \\epsilon}} \\nabla Q_{i}(w) + v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) + + \epsilon}} \nabla Q_{i}(w) w & = w - v(w, t) @@ -53,12 +53,12 @@ class RMSProp(Optimizer): .. 
math:: - r(w, t) & = \\rho r(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 + r(w, t) & = \rho r(w, t-1) + (1 - \rho)(\nabla Q_{i}(w))^2 - g(w, t) & = \\rho g(w, t-1) + (1 - \\rho)\\nabla Q_{i}(w) + g(w, t) & = \rho g(w, t-1) + (1 - \rho)\nabla Q_{i}(w) - v(w, t) & = \\beta v(w, t-1) + \\frac{\\eta} {\\sqrt{r(w,t) - (g(w, t))^2 + - \\epsilon}} \\nabla Q_{i}(w) + v(w, t) & = \beta v(w, t-1) + \frac{\eta} {\sqrt{r(w,t) - (g(w, t))^2 + + \epsilon}} \nabla Q_{i}(w) w & = w - v(w, t) diff --git a/tools/dockerfile/Dockerfile.release18 b/tools/dockerfile/Dockerfile.release18 index ddae9e1c32aef..b7e13bb2a3e86 100644 --- a/tools/dockerfile/Dockerfile.release18 +++ b/tools/dockerfile/Dockerfile.release18 @@ -27,7 +27,6 @@ RUN apt-get update && \ # Downgrade gcc&&g++ WORKDIR /usr/bin COPY tools/dockerfile/build_scripts /build_scripts -RUN bash /build_scripts/install_trt.sh RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc diff --git a/tools/dockerfile/ubuntu16_release.sh b/tools/dockerfile/ubuntu16_release.sh index 9d5d2881ccdd1..7e93bb34f9e31 100755 --- a/tools/dockerfile/ubuntu16_release.sh +++ b/tools/dockerfile/ubuntu16_release.sh @@ -22,7 +22,7 @@ function ref_whl(){ ref_gpu=gpu-cuda${ref_CUDA_MAJOR}-cudnn${CUDNN_MAJOR} install_gpu="_gpu" else - ref_gpu="cpu-avx" + ref_gpu="cpu" install_gpu="" fi @@ -56,7 +56,7 @@ function ref_whl(){ ref_dev=2.1.0.dev0 - ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" + ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}/linux/linux-${ref_gpu}-${ref_mkl}${ref_gcc}-avx" if [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} == "ON" ]]; then ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp37-cp37m-linux_x86_64.whl @@ -95,13 +95,6 @@ function install_gcc(){ } -# function install_jupyter() { -# if [[ ${WITH_NOTEBOOK} == "ON" ]];then -# # install jupyter notebook -# fi -# } - - function make_dockerfile(){ sed "s//${docker_name}/g" tools/dockerfile/Dockerfile.release16 >Dockerfile.tmp } @@ -110,7 +103,6 @@ function make_dockerfile(){ function main(){ make_dockerfile install_gcc - # install_jupyter ref_whl install_whl } diff --git a/tools/dockerfile/ubuntu18_release.sh b/tools/dockerfile/ubuntu18_release.sh index 216d8528200e5..286cb9c6919a1 100755 --- a/tools/dockerfile/ubuntu18_release.sh +++ b/tools/dockerfile/ubuntu18_release.sh @@ -22,7 +22,7 @@ function ref_whl(){ ref_gpu=gpu-cuda${ref_CUDA_MAJOR}-cudnn${CUDNN_MAJOR} install_gpu="_gpu" else - ref_gpu="cpu-avx" + ref_gpu="cpu" install_gpu="" fi @@ -56,7 +56,7 @@ function ref_whl(){ ref_dev=2.1.0.dev0 - ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" + ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}/linux/linux-${ref_gpu}-${ref_mkl}${ref_gcc}-avx" if [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} == "ON" ]]; then ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp37-cp37m-linux_x86_64.whl @@ -80,7 +80,6 @@ function install_gcc(){ if [ "${gcc_version}" == "8.2.0" ];then sed -i 's##WORKDIR /usr/bin \ COPY tools/dockerfile/build_scripts /build_scripts \ - RUN bash /build_scripts/install_trt.sh \ RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \ @@ -96,7 +95,6 @@ function install_gcc(){ } - 
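The optimizer docstring hunks above (Adam, Adamax, AdamW, Lamb, RMSProp) mostly just drop the doubled backslashes in the LaTeX, so the update rules themselves are unchanged. As a readability check on the reformatted formulas, here is a hedged single-step NumPy sketch of two of them, the plain Adam rule and the RMSProp-with-momentum rule, written exactly as the docstrings state them; it is an illustration of the documented math, not Paddle's kernel implementation, and the default hyperparameters are only placeholders:

```python
import numpy as np


def adam_step(param, grad, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
    """One Adam update per the docstring: bias-corrected lr, then an SGD-like step."""
    m = beta1 * m + (1 - beta1) * grad
    v = beta2 * v + (1 - beta2) * grad * grad
    lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
    param = param - lr_t * m / (np.sqrt(v) + eps)
    return param, m, v


def rmsprop_step(param, grad, r, vel, lr=0.01, rho=0.95, momentum=0.9, eps=1e-6):
    """One RMSProp update with momentum, matching the second formula block."""
    r = rho * r + (1 - rho) * grad * grad
    vel = momentum * vel + lr / np.sqrt(r + eps) * grad
    return param - vel, r, vel


w = np.zeros(3)
g = np.array([0.1, -0.2, 0.3])
print(adam_step(w, g, np.zeros(3), np.zeros(3), t=1))
print(rmsprop_step(w, g, np.zeros(3), np.zeros(3)))
```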
function make_dockerfile(){ sed "s//${docker_name}/g" tools/dockerfile/Dockerfile.release18 >Dockerfile.tmp } diff --git a/tools/test_ci_model_benchmark.sh b/tools/test_ci_model_benchmark.sh index ae2d4458efdb1..33d76dea2c5ca 100644 --- a/tools/test_ci_model_benchmark.sh +++ b/tools/test_ci_model_benchmark.sh @@ -20,18 +20,17 @@ function check_whl { [ $? -ne 0 ] && echo "build paddle failed." && exit 1 pip uninstall -y paddlepaddle_gpu pip install build/python/dist/*.whl - mkdir build/pr_whl && cp build/python/dist/*.whl build/pr_whl [ $? -ne 0 ] && echo "install paddle failed." && exit 1 - + mkdir build/pr_whl && cp build/python/dist/*.whl build/pr_whl mkdir -p /tmp/pr && mkdir -p /tmp/develop unzip -q build/python/dist/*.whl -d /tmp/pr rm -f build/python/dist/*.whl && rm -f build/python/build/.timestamp git checkout . git checkout -b develop_base_pr upstream/$BRANCH - bash -x paddle/scripts/paddle_build.sh build [ $? -ne 0 ] && echo "install paddle failed." && exit 1 cd build + make -j `nproc` unzip -q python/dist/*.whl -d /tmp/develop sed -i '/version.py/d' /tmp/pr/*/RECORD @@ -40,6 +39,7 @@ function check_whl { if [ ${diff_whl} -eq 0 ];then echo "paddle whl does not diff in PR-CI-Model-benchmark, so skip this ci" echo "ipipe_log_param_isSkipTest_model_benchmark: 1" + echo "cpu_benchmark=ON" >${cfs_dir}/model_benchmark/${AGILE_PULL_ID}/${AGILE_REVISION}/pass.txt exit 0 else echo "ipipe_log_param_isSkipTest_model_benchmark: 0" @@ -47,7 +47,7 @@ function check_whl { } function compile_install_paddle { - export CUDA_ARCH_NAME=Auto + export CUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} export PY_VERSION=3.7 export WITH_DISTRIBUTE=OFF export WITH_GPU=ON
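Finally, `test_ci_model_benchmark.sh` now reads `CUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto}`, so a CI job can pin the CUDA architecture list up front and the script only falls back to `Auto` detection when nothing is set. The same fallback pattern, sketched in Python for clarity (only the variable name is taken from the hunk; the rest is illustrative):

```python
import os

# Equivalent of the shell idiom ${CUDA_ARCH_NAME:-Auto}: honour a caller-provided
# value and fall back to "Auto" when the variable is unset or empty.
cuda_arch_name = os.environ.get("CUDA_ARCH_NAME") or "Auto"
print(f"building with CUDA_ARCH_NAME={cuda_arch_name}")
```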