diff --git a/CMakeLists.txt b/CMakeLists.txt index 30f9e3a3dcdd2..d874b21b0873d 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.15) +cmake_minimum_required(VERSION 3.10) cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -22,9 +22,6 @@ include(system) project(paddle CXX C) -include(init) -include(generic) # simplify cmake module - # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) @@ -34,10 +31,14 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# NOTE(zhiqiu): WITH_ASCEND_CL can be compiled on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +# Note(zhouwei): It uses the options above, so it is placed here +include(init) +include(generic) # simplify cmake module + if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() @@ -65,7 +66,7 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() -if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) +if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ -103,9 +104,11 @@ if(WIN32) endif() endforeach(flag_var) endif() - - # NOTE(Avin0323): Less parallel count result in faster compilation. - math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + + # NOTE(zhouwei25): temporarily change MP to 1 to reduce CPU & memory utilization + set(PROCESS_MAX 1) + #math(EXPR PROCESS_MAX "${CPU_CORES} * 1 / 2") + # Windows build turns off warnings and uses parallel compiling.
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -133,6 +136,9 @@ if(WIN32) foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() endforeach(flag_var) if (WITH_WIN_DUMP_DBG) @@ -182,7 +188,6 @@ option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) @@ -199,6 +204,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) @@ -259,9 +265,6 @@ endif() if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") - if(WITH_GRPC) - message(FATAL_ERROR "Can't use grpc with brpc rdma.") - endif() if(NOT WITH_DISTRIBUTE) message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") endif() @@ -366,6 +369,13 @@ else() message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") endif() +if(WITH_STRIP) + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." 
FORCE) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index bf1352d4e1147..e7f125269be1f 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -177,10 +177,6 @@ if(WITH_HETERPS) add_definitions(-DPADDLE_WITH_HETERPS) endif() -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) - if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index 13676ec910b05..414b2a54be034 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -26,7 +26,8 @@ if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_str add_definitions(-DPADDLE_WITH_ASCEND_STRING) endif() -if(WITH_ASCEND) + +if(WITH_ASCEND OR WITH_ASCEND_CL) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) @@ -49,7 +50,6 @@ if(WITH_ASCEND) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) @@ -65,6 +65,7 @@ endif() if(WITH_ASCEND_CL) set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) @@ -78,6 +79,9 @@ if(WITH_ASCEND_CL) ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c85654a5674a0..a5c74a46631e9 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + # we dont test hcom op, because it need complex configuration + # with more than one machine + if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() endif() endfunction(cc_test) @@ -807,7 +818,7 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - + if (WIN32) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4864e04fa0516..9694a7bc59c12 100644 --- a/cmake/inference_lib.cmake +++ 
b/cmake/inference_lib.cmake @@ -211,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*) + set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) endif(WIN32) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib} + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference diff --git a/cmake/init.cmake b/cmake/init.cmake index 19fdb6c601a11..b11156d2e9986 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,10 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() - # It has not been used now, it can specify CUDA compile flag manualy, + # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. - # Now, it's used in CUDA:[10.0, 10.2] + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 296940dc3f50c..3c069bd2981c4 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)" - diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 81fa7d0dfa98f..f90fa3509d63d 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -29,9 +29,9 @@ set(third_party_deps) # 2. REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party # 4. DIR: overwrite the original SOURCE_DIR when cache directory -# +# # The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, +# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # and you no longer need to set any donwnload steps in ExternalProject_Add. # For example: # Cache_third_party(${TARGET} @@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) SET(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY ${cache_third_party_REPOSITORY}) IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD + LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) ENDIF() ELSEIF(cache_third_party_URL) @@ -130,7 +130,7 @@ ENDFUNCTION() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) - MESSAGE(WARNING + MESSAGE(WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) endif() @@ -141,7 +141,7 @@ if(WIN32 OR APPLE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) if(WITH_LIBXSMM) - MESSAGE(WARNING + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." 
"Force WITH_LIBXSMM=OFF") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) @@ -276,7 +276,7 @@ endif(WITH_BOX_PS) if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) list(APPEND third_party_deps extern_ascend) endif() if(WITH_ASCEND_CL) @@ -290,7 +290,7 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - + include(external/brpc) list(APPEND third_party_deps extern_brpc) diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc new file mode 100644 index 0000000000000..b4f42dab6790b --- /dev/null +++ b/go/demo/mobilenet_c_exp.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include + +void ReadData(float* data, int size); + +int main(int argc, char* argv[]) { + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); + PD_ConfigDisableGlogInfo(config); + + PD_Predictor* predictor = PD_PredictorCreate(config); + // config has destroyed in PD_PredictorCreate + config = NULL; + + int input_num = PD_PredictorGetInputNum(predictor); + printf("Input num: %d\n", input_num); + int output_num = PD_PredictorGetOutputNum(predictor); + printf("Output num: %d\n", output_num); + + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* input_tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + PD_OneDimArrayCstrDestroy(input_names); + input_names = NULL; + + int32_t shape[] = {1, 3, 300, 300}; + float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT + ReadData(data, 1 * 3 * 300 * 300); // NOLINT + PD_TensorReshape(input_tensor, 4, shape); + PD_TensorCopyFromCpuFloat(input_tensor, data); + free(data); + data = NULL; + PD_PredictorRun(predictor); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayCstrDestroy(output_names); + output_names = nullptr; + + PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); + int32_t size = 1; + for (size_t index = 0; index < out_shape->size; ++index) { + size = size * out_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(out_shape); + out_shape = NULL; + + data = (float*)malloc(sizeof(float) * size); // NOLINT + PD_TensorCopyToCpuFloat(output_tensor, data); + free(data); + data = NULL; + + PD_TensorDestroy(output_tensor); + output_tensor = NULL; + PD_TensorDestroy(input_tensor); + input_tensor = NULL; + PD_PredictorDestroy(predictor); + predictor = NULL; + + return 0; +} + +void ReadData(float* data, int n) { + FILE* fp = fopen("data/data.txt", "r"); + for (int i = 0; i < n; i++) { + fscanf(fp, "%f", &data[i]); + } + fclose(fp); +} diff --git a/paddle/extension.h b/paddle/extension.h index 71469576853a3..98d4bfd0326c5 100644 --- 
a/paddle/extension.h +++ b/paddle/extension.h @@ -15,4 +15,4 @@ limitations under the License. */ #pragma once // All paddle apis in C++ frontend -#include "paddle/fluid/extension/include/ext_all.h" +#include "paddle/extension/include/ext_all.h" diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index e185f23e3d240..c6657be96ba44 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -54,19 +54,7 @@ class GraphPyService { std::vector table_feat_conf_feat_dtype; std::vector table_feat_conf_feat_shape; - // std::thread *server_thread, *client_thread; - - // std::shared_ptr pserver_ptr; - - // std::shared_ptr worker_ptr; - public: - // std::shared_ptr get_ps_server() { - // return pserver_ptr; - // } - // std::shared_ptr get_ps_client() { - // return worker_ptr; - // } int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } void GetDownpourSparseTableProto( diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 020bcdcc52ef4..0dc99de1bfe82 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -171,7 +171,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { auto paths = paddle::string::split_string(path, ";"); - int count = 0; + int64_t count = 0; std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 8ddf3c8f904a6..b18da82abe61c 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -33,26 +33,11 @@ namespace paddle { namespace distributed { class GraphShard { public: - // static int bucket_low_bound; - // static int gcd(int s, int t) { - // if (s % t == 0) return t; - // return gcd(t, s % t); - // } size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { - this->shard_num = shard_num; - // bucket_size = init_bucket_size(shard_num); - // bucket.resize(bucket_size); - } + GraphShard(int shard_num) { this->shard_num = shard_num; } std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - // int init_bucket_size(int shard_num) { - // for (int i = bucket_low_bound;; i++) { - // if (gcd(i, shard_num) == 1) return i; - // } - // return -1; - // } std::vector get_ids_by_range(int start, int end) { std::vector res; for (int i = start; i < end && i < bucket.size(); i++) { @@ -64,7 +49,6 @@ class GraphShard { FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); - // std::unordered_map::iterator> std::unordered_map get_node_location() { return node_location; } @@ -131,7 +115,7 @@ class GraphTable : public SparseTable { protected: std::vector shards; size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; - const int task_pool_size_ = 11; + const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 117841f80cf47..259901c09f3e0 100644 --- 
a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -28,5 +28,8 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); void RegisterOperatorWithMetaInfoMap( const paddle::OpMetaInfoMap& op_meta_info_map); +// Interface for selective register custom op. +void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 628b9f0d70f59..a49e492e48028 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -638,7 +638,8 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index a539a5d5f96b5..5780a95343385 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index e6a7d74cc4343..654b88920acaf --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -139,6 +139,10 @@ message PipelineConfig { optional string schedule_mode = 3 [ default = '1F1B' ]; } +message TensorParallelConfig { + optional int32 tensor_parallel_degree = 1 [ default = 1 ]; +} + message DistributedStrategy { // bool options optional Mode mode = 1 [ default = COLLECTIVE ]; @@ -169,6 +173,7 @@ message DistributedStrategy { optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; optional bool find_unused_parameters = 28 [ default = true ]; + optional bool tensor_parallel = 29 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -182,6 +187,7 @@ message DistributedStrategy { optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; optional HybridConfig hybrid_configs = 112; + optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index c8517b9503741..03dd2cff655c0 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -43,6 +43,6 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) -endif(WITH_ASCEND) +endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc 
b/paddle/fluid/framework/fleet/ascend_wrapper.cc index d1b2f51f70036..273939f6bee61 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.cc +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index baa2fd126a4b7..f749ee8cfa0ba 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "ge/ge_api.h" -#include "ge/ge_api_types.h" #include "graph/attr_value.h" #include "graph/tensor.h" #include "graph/types.h" diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 59d071e103459..48f79e63b4f0e 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -290,10 +290,20 @@ static int BuildFusion(Graph* graph, const std::string& name_scope ids.push_back(inner_pattern_ins[js[iter]].first->Name()); embs.push_back(inner_pattern_ins[js[iter]].second->Name()); } + OpDesc new_op_desc; new_op_desc.SetType("fused_embedding_eltwise_layernorm"); new_op_desc.SetInput("Ids", ids); new_op_desc.SetInput("Embs", embs); + + new_op_desc.SetInput("WordId", {ids[0]}); + new_op_desc.SetInput("PosId", {ids[1]}); + new_op_desc.SetInput("SentId", {ids[2]}); + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 0a70440765d44..25bf03f426a1d 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign"}; + "softsign", "silu"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5043fce8885cd..2fc39fd25d56c 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -225,10 +225,13 @@ void FuseDequant(ir::Graph* graph, Scope* scope, quantized_op_type == "depthwise_conv2d") { PADDLE_ENFORCE_EQ( dequant_type, "fake_channel_wise_dequantize_max_abs", - platform::errors::InvalidArgument("conv2d op must be dequantized by " - "[fake_channel_wise_dequantize_max_" - "abs], but got %s", - dequant_type)); + platform::errors::InvalidArgument( + "conv2d op must be dequantized by " + 
"[fake_channel_wise_dequantize_max_abs], but got %s. " + "If you uses PaddleSlim to generate the quantized " + "model, please set the 'weight_quantize_type' params as " + "'channel_wise_abs_max' and generate the quantized model again.", + dequant_type)); PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 3a79452e230ef..0a6b5e44452fe 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -268,6 +268,21 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, TensorToStream(os, static_cast(tensor), dev_ctx); } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + auto place = tensor.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, tensor, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, tensor, *dev_ctx); +} + void DeserializeFromStream(std::istream &is, LoDTensor *tensor, const platform::DeviceContext &dev_ctx, const size_t &seek, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 6b6112f1f3efa..6b357aba1c5f9 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -255,5 +255,9 @@ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); LoD ConvertToOffsetBasedLoD(const LoD& length_lod); +void SerializeToStream(std::ostream& os, const LoDTensor& tensor); + +void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dd93639f31908..73a699b41c8e0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,7 +27,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -631,141 +630,15 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else { - device_name = "XPU"; - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, places.size(), places.size()); - - // Step 1. Bcast the bcast_vars to devs. 
- // Create local scopes - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(member_->global_scope_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } - } - - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < places.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(*graph, exec_strategy, - member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } + // Initialize necessary info of member_ with strategy. + InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), + *graph); - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + // Step 1. 
Create local scopes and Clone graph into multi device + CreateLocalScopes(scope, local_scopes, /*create_new*/ true); + std::vector graphs = CloneGraphToMultiDevices(graph); + PrepareNCCLCommunicator(scope); - auto *bkcl_ctxs = - member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { if (member_->build_strategy_.num_trainers_ > 1) { @@ -778,257 +651,75 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); - } -#endif - + std::vector async_graphs = + CompileGraphWithBuildStrategy(graph, &graphs, 
loss_var_name); graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos.emplace_back(); - var_infos.back() = fused_var.second; + CreateVariableInfos(&var_infos, graph); + std::unordered_map scope_map = + CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } + // Step 4. Create SSAGraph executor + std::vector final_graphs = + CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - std::unordered_map scope_map; - for (auto *scope : member_->local_scopes_) { - auto &local_exec_scope = scope->NewScope(); - member_->local_exec_scopes_.emplace_back(&local_exec_scope); - scope_map.emplace(scope, &local_exec_scope); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); + SetReaderOpDeviceInfoOfGraphs(final_graphs); +} - std::vector final_graphs; +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). + for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, async_graphs)); - final_graphs = async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. 
- bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. 
It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); - } - } - final_graphs.emplace_back(graph); - } - } - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); - } - - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - } - } - - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). 
- for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - buffers.push_back(buffer); - } + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), platform::errors::PreconditionNotMet( @@ -1367,6 +1058,399 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } +void ParallelExecutor::InitExecutorPrivateMemberInfo( + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + size_t device_count, const ir::Graph &graph) { + member_->use_device_ = exec_strategy.use_device_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * device_count; + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::Unavailable("Windows can support Single GPU only.")); + } +#endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } +#endif + + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + + VLOG(1) << string::Sprintf( + "The Program will be executed on %s using ParallelExecutor, %lu " + "cards are used, so %lu programs are executed in parallel.", + device_name, device_count, device_count); + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } +} + +void ParallelExecutor::CreateLocalScopes( + Scope *global_scope, const std::vector &local_scopes, + bool create_new) { + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(global_scope); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&global_scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + platform::errors::PreconditionNotMet( + "member_->places_.size() = %d is not equal to " + "local_scopes.size() = %d", + member_->places_.size(), local_scopes.size())); + for (size_t i = 0; i < member_->places_.size(); ++i) { + if (create_new) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } else { + // Use local scopes directly + member_->local_scopes_.emplace_back(local_scopes[i]); + } + } + } +} + +std::unordered_map ParallelExecutor::CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new) { + std::unordered_map scope_map; + + for (auto *scope : local_scopes) { + Scope *local_exec_scope = scope; + if (create_new) { + local_exec_scope = &scope->NewScope(); + } + member_->local_exec_scopes_.emplace_back(local_exec_scope); + scope_map.emplace(scope, local_exec_scope); + } + + PADDLE_ENFORCE_EQ( + member_->local_scopes_.size(), member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + + return scope_map; +} + +std::vector ParallelExecutor::CloneGraphToMultiDevices( + ir::Graph *graph) { + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + platform::errors::Unavailable( + "gpu mode does not support async_mode_ now!")); + graphs.push_back(graph); + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + return graphs; +} + +void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); + + // Initialize device context's nccl 
comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. + auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); + + auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } +} + +std::vector ParallelExecutor::CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *device_graphs, + const std::string &loss_var_name) { + auto device_count = member_->places_.size(); + std::vector async_graphs(device_count); + + auto &graphs = *device_graphs; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->nccl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + 
graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_); + } +#endif + + return async_graphs; +} + +void ParallelExecutor::CreateVariableInfos( + std::vector *var_infos, ir::Graph *graph) { + PADDLE_ENFORCE_EQ( + var_infos->size(), 0, + platform::errors::PreconditionNotMet( + "var_infos->size() shoule be 0, but received %d", var_infos->size())); + PADDLE_ENFORCE_EQ( + member_->is_persistable_.size(), 0, + platform::errors::PreconditionNotMet( + "member_->is_persistable_.size() shoule be 0, but received %d", + member_->is_persistable_.size())); + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos->emplace_back(); + var_infos->back().name_ = node->Var()->Name(); + var_infos->back().type_ = node->Var()->GetType(); + var_infos->back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + var_infos->emplace_back(); + var_infos->back() = fused_var.second; + + member_->is_persistable_.emplace(fused_var.first, + fused_var.second.persistable_); + } + } +} + +std::vector ParallelExecutor::CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph) { + std::vector final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, *async_graphs)); + final_graphs = *async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr. 
+ bool is_inference = details::IsDataParallelInferenceGraph(*graph); + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + + if (is_inference && member_->places_.size() > 1) { + member_->inference_executor_ = pg_exe; + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle should be compiled with CUDA for ParallelGraph Execution.")); +#endif + } else { + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto possible_inference_graphs = + details::TrySeparateToMultipleSingleDeviceGraphs(graph); + if (!possible_inference_graphs.empty()) { + VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, std::move(possible_inference_graphs)); + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + member_->inference_executor_ = pg_exe; + } else { + LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) + << "drop_last=False for DataLoader is not supported in training " + "network. It is automatically turned to drop_last=True."; + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } + final_graphs.emplace_back(graph); + } + } + return final_graphs; +} + +void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map) { + PADDLE_ENFORCE_GE( + final_graphs.size(), 1, + platform::errors::PreconditionNotMet( + "final_graphs shoule contain at least one graph, but received %d", + final_graphs.size())); + + PADDLE_ENFORCE_GT(scope_map.size(), 0, + platform::errors::PreconditionNotMet( + "scope_map shoule contain at least one " + "element, but received %d", + scope_map.size())); + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs) { + if (final_graphs.size() == 1) { + ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); + } else { + for (size_t i = 0; i < final_graphs.size(); ++i) { + ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); + } + } +} + const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47de7dc48f4f2..d4d0b534b55f0 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -24,6 +24,7 
@@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" @@ -41,6 +42,7 @@ namespace framework { class ParallelExecutorPrivate; +using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; namespace p = paddle::platform; @@ -93,6 +95,40 @@ class ParallelExecutor { const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; + void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph); + + void CreateLocalScopes(Scope *global_scope, + const std::vector &local_scopes, + bool create_new); + + std::unordered_map CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new); + + std::vector CloneGraphToMultiDevices(ir::Graph *graph); + + void PrepareNCCLCommunicator(Scope *global_scope); + + std::vector CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *graphs, + const std::string &loss_var_name); + + void CreateVariableInfos(std::vector *var_infos, + ir::Graph *graph); + + std::vector CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph); + + void ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map); + + void SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; }; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 5968df548dfb0..cdd2dbd5b1d2d 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined WITH_ASCEND_CL) // NOLINT + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index e740771e5ca9f..7860b69313e7b 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 4c30c40ad5837..7e48d0dc5f962 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,6 +113,21 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + auto place = selected_rows.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, selected_rows, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, selected_rows, *dev_ctx); +} + void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 48353b43f56ca..e53e3d973c524 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -173,5 +173,9 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 7efb89ad7d9d9..01aa07e618464 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -332,7 +332,8 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index d2adbdd34512b..0f8465ab8948e 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -53,27 +53,28 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { // Use pointer here for safe static deinitialization static auto *allow_set = new std::unordered_set({ // called once - "batch_norm", // 0 - "batch_norm_grad", // 0 - "sync_batch_norm", // 0 - "sync_batch_norm_grad", // 0 - "inplace_abn", // 0 - "inplace_abn_grad", // 0 - "dgc_momentum", // 0 - "fake_quantize_range_abs_max", // 0 - "rmsprop", // 0 - "sequence_conv_grad", // 0 - "roi_perspective_transform_grad", // 0 - "fill_zeros_like", // 1 - "fill_any_like", // 1 - "nce_grad", // 1 - "precision_recall", // 1 - "fusion_seqpool_cvm_concat", // 2 - "fused_batch_norm_act", // 2 - "fused_batch_norm_act_grad", // 2 - "data_norm", // 0 - "data_norm_grad", // 0 - "update_loss_scaling", // 0 + 
"batch_norm", // 0 + "batch_norm_grad", // 0 + "sync_batch_norm", // 0 + "sync_batch_norm_grad", // 0 + "inplace_abn", // 0 + "inplace_abn_grad", // 0 + "dgc_momentum", // 0 + "fake_quantize_range_abs_max", // 0 + "rmsprop", // 0 + "sequence_conv_grad", // 0 + "roi_perspective_transform_grad", // 0 + "fill_zeros_like", // 1 + "fill_any_like", // 1 + "nce_grad", // 1 + "precision_recall", // 1 + "fusion_seqpool_cvm_concat", // 2 + "fused_batch_norm_act", // 2 + "fused_batch_norm_act_grad", // 2 + "data_norm", // 0 + "data_norm_grad", // 0 + "update_loss_scaling", // 0 + "fused_embedding_eltwise_layernorm", // 0 }); return *allow_set; } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index fc754cbaf177c..473df85aa0421 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -36,6 +36,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -50,6 +55,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -162,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index a56458b21398b..fd2bb6e5c9952 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -26,7 +26,24 @@ class VarBase; AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), - block_ops_(new std::unordered_set()) {} + block_ops_(new std::unordered_set()), + unsupported_fp16_ops_(new std::unordered_set()) { + auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); + auto fp16_dtype = framework::proto::VarType::FP16; + for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) { + bool supported = false; + for (auto& kernel_type : it->second) { + if (platform::is_gpu_place(kernel_type.first.place_) && + kernel_type.first.data_type_ == fp16_dtype) { + supported = true; + } + } + if (!supported) { + unsupported_fp16_ops_->insert(it->first); + } + } +} + AmpOperators::~AmpOperators() {} AmpOperators& AmpOperators::Instance() { @@ -44,16 +61,26 @@ AmpOperators::GetMutableBlockOps() { return block_ops_; } +std::shared_ptr> +AmpOperators::GetMutableUnsupportedFp16Ops() { + return unsupported_fp16_ops_; +} + std::ostream& operator<<(std::ostream& os, AmpOperators& ops) { os << "allow ops: "; auto allow_ops = ops.GetMutableAllowOps(); std::copy((*allow_ops).begin(), (*allow_ops).end(), std::ostream_iterator(os, " ")); - os << "; "; + os << "\n"; os << "block ops: "; auto block_ops = ops.GetMutableBlockOps(); std::copy((*block_ops).begin(), (*block_ops).end(), std::ostream_iterator(os, " ")); + os << "\n"; + os << "unsupported fp16 ops: "; + auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops(); + std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(), + std::ostream_iterator(os, " ")); return os; } @@ -156,6 +183,12 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, return new_ins; } else { auto dst_type = GetPromoteType(ins); + // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. 
+ if (dst_type == framework::proto::VarType::FP16 && + AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. if ((op_type == "batch_norm" || op_type == "layer_norm") && diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 619c6b0baf896..fa76c19688a69 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -40,6 +40,9 @@ class AmpOperators { std::shared_ptr> GetMutableBlockOps(); + std::shared_ptr> + GetMutableUnsupportedFp16Ops(); + private: AmpOperators(); // forbid calling default constructor @@ -50,6 +53,9 @@ class AmpOperators { // The set of ops that support fp16 calculation and are considered numerically // dangerous and whose effects may also be observed in downstream ops. std::shared_ptr> block_ops_; + + // The set of ops that has no fp16 CUDA kennel. + std::shared_ptr> unsupported_fp16_ops_; }; std::ostream& operator<<(std::ostream& os, AmpOperators& ops); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 70359dc3fd25b..a4af3117d3e32 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const { } void VarBase::ClearGradient() { + VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { if (grad_var_->Var().IsType()) { auto* grad_t = diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 93fd85f13cbf0..c002c7a10cb7b 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -33,7 +33,7 @@ if (WITH_LITE) add_subdirectory(lite) endif() -# fluid_modules exclude API-interface of inference/api and inference/capi +# fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # Adapt to custom op mechanism: Include the header files related to the data type @@ -61,7 +61,7 @@ if(NOT APPLE) endif() # C inference API -add_subdirectory(capi) +add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index bd27b1f5f3447..255c6ca75dfd7 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -213,6 +213,11 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); + DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); + DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); + DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); @@ -222,6 +227,11 @@ struct Argument { DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); + DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool); + DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool); + DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); + DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); + 
DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a4e263e2f464c..8407f98e6dfd9 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument, // run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + } else if (pass_name == "dlnne_subgraph_pass") { + pass->Set("min_subgraph_size", + new int(argument->dlnne_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { bool enable_int8 = @@ -183,6 +188,12 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->xpu_l3_workspace_size())); pass->Set("cpu_math_library_num_threads", new int(argument->cpu_math_library_num_threads())); + pass->Set("locked", new bool(argument->xpu_locked())); + pass->Set("autotune", new bool(argument->xpu_autotune())); + pass->Set("autotune_file", + new std::string(argument->xpu_autotune_file())); + pass->Set("precision", new std::string(argument->xpu_precision())); + pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index e35178428cc7b..330f7a9984734 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -20,3 +20,15 @@ if (WITH_LITE) set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) endif() + +MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +if(WITH_DLNNE) + cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) + set(analysis_deps ${analysis_deps} + subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h new file mode 100644 index 0000000000000..ae977c1403a87 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h @@ -0,0 +1,21 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
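The pass-manager hunk above forwards the new DLNNE and XPU options into the subgraph passes through the generic pass-attribute mechanism: the analysis layer stores each value with pass->Set(name, new T(value)) and the pass reads it back with Get<T>(name). A compressed sketch of the consumer side, with MyPass standing in for a real pass:

#include "paddle/fluid/framework/ir/fuse_pass_base.h"

class MyPass : public paddle::framework::ir::FusePassBase {
 public:
  void ApplyImpl(paddle::framework::ir::Graph *graph) const override {
    // Reads the attribute registered by IRPassManager::CreatePasses() via
    // pass->Set("min_subgraph_size", new int(...)); ownership stays with the pass.
    int min_subgraph_size = Get<int>("min_subgraph_size");
    (void)min_subgraph_size;
    (void)graph;
  }
};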
+#pragma once + +namespace paddle { +namespace inference { + +int RegisterPyFunc(const std::string& name, void* pfn); +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc new file mode 100644 index 0000000000000..8f789139af9bf --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { + +int (*PyConvertGraph)(const char *graph_name); + +int RegisterPyFunc(const std::string &name, void *pfn) { + if (name.compare("convert_graph") == 0) { + PyConvertGraph = reinterpret_cast(pfn); + } + + return 0; +} +int ConvertGraph(std::string graph_name) { + LOG(INFO) << "starting doing convert_graph"; + + PyConvertGraph(graph_name.c_str()); + + return 0; +} + +namespace analysis { + +using framework::ir::Node; + +void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { + static std::unordered_set teller_set{ + "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", + // "fc", + "shuffle_channel", "swish", "split", + // "instance_norm", + "gelu", + // "layer_norm", + // "scale", + // "stack", + "relu6", "reshape2", "transpose2", "concat", "slice", + }; + + framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); + + auto teller = [&](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return teller_set.find(node->Op()->Type()) != teller_set.end(); + }; + + framework::ir::SubGraphFuser fuser( + graph, teller, Get("min_subgraph_size") /*min subgraph size*/, + "dlnne_engine"); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in dlnne, and should not have another copy in + // fluid. 
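dlnne_subgraph_pass.cc delegates the actual engine build to a callback registered at runtime: RegisterPyFunc only recognizes the name "convert_graph", and ConvertGraph() later invokes the stored pointer with each fused subgraph's engine key. A minimal sketch of registering a host-side converter, where MyConvertGraph is a hypothetical callback:

#include <cstdio>
#include <string>

#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h"

static int MyConvertGraph(const char *graph_name) {
  // Called once per dlnne_engine subgraph with its engine key.
  std::printf("converting DLNNE subgraph %s\n", graph_name);
  return 0;
}

void RegisterDlnneConverter() {
  paddle::inference::RegisterPyFunc("convert_graph",
                                    reinterpret_cast<void *>(&MyConvertGraph));
}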
+ std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { + CreateDlnneOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + framework::ir::Agent(node).subgraph()->begin(), + framework::ir::Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && framework::ir::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} +std::string replace_name(std::string name, const char *raw, + const char *new_char) { + std::string r_name = name; + int pos = r_name.find(raw); + while (pos >= 0) { + r_name = r_name.replace(pos, 1, new_char); + pos = r_name.find(raw); + } + return r_name; +} + +void DlnneSubgraphPass::CreateDlnneOp( + framework::ir::Node *node, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *framework::ir::Agent(node).subgraph(); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); + + // A fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + // for debug + framework::ProgramDesc tmp_dump_program_desc; + auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0); + + std::unordered_map name_var_desc; + std::set name_var_input_nodes; + std::set name_var_output_nodes; + std::set name_ops; + + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + + // debug + { + name_ops.insert(node->Name()); + auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp(); + + framework::OpDesc op_desc; + op_desc.CopyFrom(*node->Op()); + + for (auto argument_name : op_desc.InputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + for (auto argument_name : op_desc.OutputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + *tmp_dump_new_block_op->Proto() = *op_desc.Proto(); + + for (auto *x : node->inputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_input_nodes.insert(x->Name()); + } + + for (auto *x : node->outputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_output_nodes.insert(x->Name()); + } + } + } + std::set valid_input_names; 
+ std::set valid_output_names; + for (auto name : name_var_output_nodes) { + if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) { + valid_output_names.insert(name); + } + } + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + valid_input_names.insert(name); + } + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + + // The node->inputs contains input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + std::vector origin_output_dims; + for (auto *x : node->outputs) { + origin_output_dims.push_back(x->Var()->GetShape().size()); + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + + // Set attrs + op_desc->SetType("dlnne_engine"); + op_desc->SetInput("Xs", std::vector(valid_input_names.begin(), + valid_input_names.end())); + + op_desc->SetOutput("Ys", std::vector(valid_output_names.begin(), + valid_output_names.end())); + + op_desc->SetAttr("parameters", params); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + op_desc->SetAttr("engine_key", engine_key); + auto *scope = param_scope(); + + { + std::set input_names; + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + input_names.insert(name); + } + } + + // add feed to subgraph: + int input_idx = 0; + for (auto input_name : input_names) { + auto *feed0 = tmp_dump_main_block->AppendOp(); + feed0->SetType("feed"); + feed0->SetInput("X", {"feed"}); + feed0->SetOutput("Out", {input_name}); + feed0->SetAttr("col", input_idx); + input_idx++; + } + // add fetch to subgraph: + int output_idx = 0; + for (auto output_name : valid_output_names) { + auto *fetch0 = tmp_dump_main_block->AppendOp(); + fetch0->SetType("fetch"); + fetch0->SetInput("X", {output_name}); + fetch0->SetOutput("Out", {"out"}); + fetch0->SetAttr("col", output_idx); + output_idx++; + } + + mkdir("./dump", 0777); + std::string dir_name = "./dump/" + engine_key; + mkdir(dir_name.c_str(), 0777); + ofstream m_stream; + m_stream.open(dir_name + "/__model__", ios::out); + + VLOG(4) << "name_var_desc size:" << name_var_desc.size(); + + for (auto &kv : name_var_desc) { + auto *new_add_var = tmp_dump_main_block->Proto()->add_vars(); + *new_add_var = *kv.second->Proto(); + auto *variable_tmp = scope->FindVar(kv.first); + if (variable_tmp != nullptr) { + *new_add_var->mutable_name() = replace_name(kv.first, "/", "."); + new_add_var->set_persistable(true); + } else { + new_add_var->set_persistable(false); + } + } + + for (auto param_name : params) { + auto *var = scope->FindVar(param_name); + if (var != 
nullptr) { + auto *var_t = var->GetMutable(); + ofstream p_stream; + p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."), + ios::out); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(var_t->place()); + framework::SerializeToStream(p_stream, *var_t, dev_ctx); + p_stream.close(); + } + } + + std::string model; + + tmp_dump_program_desc.Proto()->SerializeToString(&model); + m_stream << model; + m_stream.close(); + + op_desc->SetBlockAttr("sub_block", tmp_dump_main_block); + op_desc->SetAttr("subgraph", model); + op_desc->Flush(); + + ConvertGraph(engine_key); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(dlnne_subgraph_pass, + paddle::inference::analysis::DlnneSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h new file mode 100644 index 0000000000000..5a1d2506fdb09 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { + +int ConvertGraph(std::string graph_name); + +namespace analysis { + +class DlnneSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CleanIntermediateOutputs(framework::ir::Node *node); + void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c697914904b3e..b8cac8992f4ee 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -245,6 +245,11 @@ void LiteSubgraphPass::SetUpEngine( bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); + bool locked = Get("locked"); + bool autotune = Get("autotune"); + std::string autotune_file = Get("autotune_file"); + std::string precision = Get("precision"); + bool adaptive_seqlen = Get("adaptive_seqlen"); lite_api::TargetType target_type; if (use_gpu) 
{ @@ -282,6 +287,11 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + config.locked = locked; + config.autotune = autotune; + config.autotune_file = autotune_file; + config.precision = precision; + config.adaptive_seqlen = adaptive_seqlen; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 60de4234b41a8..f57f07883dcd7 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { @@ -321,11 +322,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( opt_input_shape = {}; } - if (min_input_shape.size() > 0 && TRT_VERSION > 6000) { + auto to_major_version = [&](int full_version) -> float { + return (full_version / 100) / 10.0; + }; + const float compile_time_trt_version = to_major_version(TRT_VERSION); + const float run_time_trt_version = + to_major_version(tensorrt::GetInferLibVersion()); + if (compile_time_trt_version != run_time_trt_version) { LOG_FIRST_N(WARNING, 1) - << "The Paddle lib links the " << TRT_VERSION << " version TensorRT, " - << "make sure the runtime TensorRT you are using is no less than this " - "version, otherwise, there might be Segfault!"; + << "The Paddle Inference library is compiled with " + << compile_time_trt_version << " version TensorRT, " + << "but the runtime TensorRT you are using is " << run_time_trt_version + << " version. " + "This might cause serious compatibility issues. 
We strongly " + "recommend using the same TRT version at runtime."; } // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 03f86cc7ba6de..82c95ba2c9571 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -32,10 +32,10 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) if(WITH_CRYPTO) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto) + analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator) else() cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto) + analysis_config zero_copy_tensor trainer_desc_proto custom_operator) endif() if(WIN32) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0622fb27d9e38..853c1ac1da874 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -26,6 +26,7 @@ namespace paddle { struct MkldnnQuantizerConfig; extern const std::vector kTRTSubgraphPasses; +extern const std::vector kDlnneSubgraphPasses; extern const std::vector kLiteSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -95,9 +96,17 @@ void AnalysisConfig::DisableFCPadding() { Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size) { +void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, + bool autotune, const std::string &autotune_file, + const std::string &precision, + bool adaptive_seqlen) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; + xpu_locked_ = locked; + xpu_autotune_ = autotune; + xpu_autotune_file_ = autotune_file; + xpu_precision_ = precision; + xpu_adaptive_seqlen_ = adaptive_seqlen; Update(); } @@ -134,6 +143,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + // Dlnne related + CP_MEMBER(use_dlnne_); + CP_MEMBER(dlnne_min_subgraph_size_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -157,6 +169,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_xpu_); CP_MEMBER(xpu_l3_workspace_size_); + CP_MEMBER(xpu_locked_); + CP_MEMBER(xpu_autotune_); + CP_MEMBER(xpu_autotune_file_); + CP_MEMBER(xpu_precision_); + CP_MEMBER(xpu_adaptive_seqlen_); // profile related. CP_MEMBER(with_profile_); @@ -211,6 +228,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + if (use_dlnne_) { + auto all_passes = kDlnneSubgraphPasses; + auto other_passes = other.pass_builder()->AllPasses(); + // We should sort them, because the user may call the SwitchIrDebug + // interface, which will change the pass. 
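The TensorRT check above compares versions only after normalizing them to MAJOR.MINOR, so patch-level differences between the build-time headers and the runtime library stay silent while a real major/minor mismatch produces the warning. A small worked example of the arithmetic, with illustrative packed version values:

#include <cassert>

int main() {
  // Same normalization as to_major_version() in tensorrt_subgraph_pass.cc.
  auto to_major_version = [](int full_version) -> float {
    return (full_version / 100) / 10.0;  // e.g. 7134 -> 71 -> 7.1
  };
  assert(to_major_version(7134) == to_major_version(7145));  // 7.1.x vs 7.1.y: silent
  assert(to_major_version(7134) != to_major_version(7215));  // 7.1 vs 7.2: warning path
  return 0;
}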
+ std::sort(all_passes.begin(), all_passes.end()); + std::sort(other_passes.begin(), other_passes.end()); + std::vector deleted_passes; + std::set_difference(all_passes.begin(), all_passes.end(), + other_passes.begin(), other_passes.end(), + std::inserter(deleted_passes, deleted_passes.begin())); + for (auto ps : deleted_passes) { + pass_builder_->DeletePass(ps); + } + } } void AnalysisConfig::EnableCUDNN() { @@ -309,6 +341,12 @@ void AnalysisConfig::EnableTensorRtEngine( #endif } +void AnalysisConfig::EnableDlnne(int min_subgraph_size) { + use_dlnne_ = true; + dlnne_min_subgraph_size_ = min_subgraph_size; + Update(); +} + void AnalysisConfig::SetTRTDynamicShapeInfo( std::map> min_input_shape, std::map> max_input_shape, @@ -383,6 +421,14 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl; + if (use_dlnne_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kDlnneSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_gpu() && use_cudnn_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { @@ -479,6 +525,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_max_batchsize_; ss << tensorrt_min_subgraph_size_; + ss << use_dlnne_; + ss << dlnne_min_subgraph_size_; + for (auto &op : trt_disabled_ops_) ss << op.c_str(); ss << ";"; @@ -512,6 +561,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_lite_; ss << use_xpu_; ss << xpu_l3_workspace_size_; + ss << xpu_locked_; + ss << xpu_autotune_; + ss << xpu_autotune_file_; + ss << xpu_precision_; + ss << xpu_adaptive_seqlen_; ss << thread_local_stream_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4b6c746d57525..6a6be14fd5977 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -537,6 +537,12 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); } + if (config_.dlnne_enabled()) { + LOG(INFO) << "Dlnne subgraph is enabled"; + argument_.SetUseDlnne(true); + argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -546,6 +552,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLiteZeroCopy(config_.lite_zero_copy_); argument_.SetUseXpu(config_.use_xpu_); argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); + argument_.SetXpuLocked(config_.xpu_locked_); + argument_.SetXpuAutotune(config_.xpu_autotune_); + argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); + argument_.SetXpuPrecision(config_.xpu_precision_); + argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); LOG(INFO) << "Lite subgraph engine is enabled"; } @@ -617,7 +628,7 @@ std::unique_ptr CreatePaddlePredictor< // This function can only be executed once per process. 
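With PrepareArgument() above forwarding the flag and AnalysisConfig::Update() installing kDlnneSubgraphPasses, enabling the DLNNE engine from user code is a single config call. A minimal sketch (model paths are illustrative):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./mobilenet/__model__", "./mobilenet/params");
  // Subgraphs with at least three supported ops are fused into dlnne_engine ops.
  config.EnableDlnne(/*min_subgraph_size=*/3);
  auto predictor = paddle::CreatePaddlePredictor(config);
  return predictor != nullptr ? 0 : 1;
}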
static std::once_flag custom_operators_registered; std::call_once(custom_operators_registered, - []() { paddle::RegisterAllCustomOperator(); }); + []() { inference::RegisterAllCustomOperator(); }); if (config.use_gpu()) { static std::once_flag gflags_initialized; diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc index 9cc491e10d691..d78560239de50 100644 --- a/paddle/fluid/inference/api/helper.cc +++ b/paddle/fluid/inference/api/helper.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/extension/include/ext_op_meta_info.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace inference { @@ -40,5 +43,20 @@ std::string to_string>>( return ss.str(); } +void RegisterAllCustomOperator() { + auto &op_meta_info_map = OpMetaInfoMap::Instance(); + const auto &meta_info_map = op_meta_info_map.GetMap(); + for (auto &pair : meta_info_map) { + const auto &all_op_kernels{framework::OperatorWithKernel::AllOpKernels()}; + if (all_op_kernels.find(pair.first) == all_op_kernels.end()) { + framework::RegisterOperatorWithMetaInfo(pair.second); + } else { + LOG(INFO) << "The operator `" << pair.first + << "` has been registered. " + "Therefore, we will not repeat the registration here."; + } + } +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 14b968f5834da..c6d25137594b7 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -398,5 +398,7 @@ static bool IsFileExists(const std::string &path) { return exists; } +void RegisterAllCustomOperator(); + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e492b32cb6cbe..2bbd4bb837a22 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -177,7 +177,10 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); - void EnableXpu(int l3_workspace_size = 0xfffc00); + void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, + bool autotune = true, const std::string& autotune_file = "", + const std::string& precision = "int16", + bool adaptive_seqlen = false); /// /// \brief A boolean state telling whether the GPU is turned on. /// @@ -360,6 +363,9 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_dla_enabled() { return trt_use_dla_; } + void EnableDlnne(int min_subgraph_size = 3); + bool dlnne_enabled() const { return use_dlnne_; } + /// /// \brief Turn on the usage of Lite sub-graph engine. /// @@ -627,6 +633,10 @@ struct PD_INFER_DECL AnalysisConfig { std::vector trt_disabled_ops_{}; bool disable_trt_plugin_fp16_{false}; + // dlnne related. + bool use_dlnne_{false}; + int dlnne_min_subgraph_size_{3}; + // memory reuse related. bool enable_memory_optim_{false}; @@ -661,6 +671,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_xpu_{false}; int xpu_l3_workspace_size_; + bool xpu_locked_; + bool xpu_autotune_; + std::string xpu_autotune_file_; + std::string xpu_precision_; + bool xpu_adaptive_seqlen_; // mkldnn related. 
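Because every new EnableXpu parameter above has a default, existing callers keep compiling unchanged; a hedged sketch of a caller that sets the options explicitly (the autotune cache file name is a placeholder):

#include "paddle/fluid/inference/api/paddle_inference_api.h"

void ConfigureXpu(paddle::AnalysisConfig *config) {
  config->EnableXpu(/*l3_workspace_size=*/0xfffc00,
                    /*locked=*/false,
                    /*autotune=*/true,
                    /*autotune_file=*/"xpu_autotune.cache",
                    /*precision=*/"int16",
                    /*adaptive_seqlen=*/false);
}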
int mkldnn_cache_capacity_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 1d77ddaf73ef7..2b7333edae0da 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -110,6 +110,15 @@ const std::vector kTRTSubgraphPasses({ "transpose_flatten_concat_fuse_pass", }); +const std::vector kDlnneSubgraphPasses({ + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "depthwise_conv_bn_fuse_pass", // + "shuffle_channel_detect_pass", // + "dlnne_subgraph_pass", // +}); + const std::vector kLiteSubgraphPasses({ #ifdef PADDLE_WITH_LITE "lite_subgraph_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index a725ebab35ead..d7556b50031b7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy { /// \brief List of tensorRT subgraph passes. PD_INFER_DECL extern const std::vector kTRTSubgraphPasses; +/// \brief List of dlnne subgraph passes. +PD_INFER_DECL extern const std::vector kDlnneSubgraphPasses; + /// \brief List of lite subgraph passes. PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 231639667244d..9bb52ba578025 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { return config->config.tensorrt_engine_enabled(); } +void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableDlnne(min_subgraph_size); +} + +bool PD_DlnneEnabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.dlnne_enabled(); +} + void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt new file mode 100644 index 0000000000000..521d24329d464 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) + +cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) + +if(NOT ON_INFER) + return() +endif() + +# Create inference capi shared library +cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +if(WIN32) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) +endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc new file mode 100644 index 0000000000000..2b049e992e71d --- /dev/null +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file lod_demo.cc +/// +/// \brief a demo for user to learn how to inference by c api. +/// it rectify from +/// paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" + +int main(int argc, char *argv[]) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + size_t output_num = PD_PredictorGetOutputNum(predictor); + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + 
PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + PD_PredictorRun(predictor); + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h new file mode 100644 index 0000000000000..4b70ed7fbad29 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef PADDLE_DLL_INFERENCE +#define PADDLE_CAPI_EXPORT __declspec(dllexport) +#else +#define PADDLE_CAPI_EXPORT __declspec(dllimport) +#endif // PADDLE_DLL_INFERENCE +#else +#define PADDLE_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __pd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __pd_take +/// argument. In between, it can be used as a value for as many __pd_keep +/// arguments as the user likes. +/// +#ifndef __pd_give +#define __pd_give +#endif +/// +/// __pd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __pd_give pointer. +/// +#ifndef __pd_take +#define __pd_take +#endif +/// +/// __pd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. 
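Read alongside lod_demo.cc above, these annotations spell out the C API's ownership contract: a __pd_give result is owned by the caller and must be released exactly once, a __pd_take parameter consumes such a pointer, and a __pd_keep parameter only borrows it. A short sketch of the contract in practice, assuming a valid predictor handle:

#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void InspectFirstInput(PD_Predictor *predictor) {
  // __pd_give: the returned name array is now owned by this function.
  PD_OneDimArrayCstr *names = PD_PredictorGetInputNames(predictor);
  // __pd_keep: the lookup only borrows the predictor and the name string.
  PD_Tensor *input = PD_PredictorGetInputHandle(predictor, names->data[0]);
  // Each owned object goes to exactly one destroy (__pd_take) call.
  PD_TensorDestroy(input);
  PD_OneDimArrayCstrDestroy(names);
}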
+/// +#ifndef __pd_keep +#define __pd_keep +#endif + +typedef int8_t PD_Bool; +#define TRUE 1 +#define FALSE 0 + +#define PD_ENUM(type) \ + typedef int32_t type; \ + enum + +PD_ENUM(PD_PrecisionType){PD_PRECISION_FLOAT32 = 0, PD_PRECISION_INT8, + PD_PRECISION_HALF}; + +PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, + PD_PLACE_XPU}; + +PD_ENUM(PD_DataType){ + PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, + PD_DATA_INT64, PD_DATA_UINT8, +}; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc new file mode 100644 index 0000000000000..c45454e86bdaa --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_NULL_POINTER_PARM(param) \ + PADDLE_ENFORCE_NOT_NULL( \ + param, paddle::platform::errors::InvalidArgument( \ + "The pointer of " #param " shouldn't be nullptr")) + +#define CHECK_AND_CONVERT_PD_CONFIG \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_config, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle config shouldn't be nullptr")); \ + Config* config = reinterpret_cast(pd_config) + +using paddle_infer::Config; + +static Config::Precision ConvertToCxxPrecisionType(PD_PrecisionType precision) { + switch (precision) { + case PD_PRECISION_FLOAT32: + return Config::Precision::kFloat32; + case PD_PRECISION_INT8: + return Config::Precision::kInt8; + case PD_PRECISION_HALF: + return Config::Precision::kHalf; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle precision type %d.", precision)); + return Config::Precision::kFloat32; + } +} + +extern "C" { +__pd_give PD_Config* PD_ConfigCreate() { + return reinterpret_cast(new Config()); +} + +void PD_ConfigDestroy(__pd_take PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + delete reinterpret_cast(config); +} + +void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetModel(prog_file_path, params_file_path); +} +void PD_ConfigSetProgFile(__pd_keep PD_Config* pd_config, + const char* prog_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + config->SetProgFile(prog_file_path); +} +void PD_ConfigSetParamsFile(__pd_keep PD_Config* pd_config, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetParamsFile(params_file_path); +} +void PD_ConfigSetOptimCacheDir(__pd_keep PD_Config* pd_config, + const char* opt_cache_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(opt_cache_dir); + 
config->SetOptimCacheDir(opt_cache_dir); +} + +void PD_ConfigSetModelDir(__pd_keep PD_Config* pd_config, + const char* model_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(model_dir); + config->SetModel(model_dir); +} +const char* PD_ConfigGetModelDir(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_dir().c_str(); +} +const char* PD_ConfigGetProgFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->prog_file().c_str(); +} +const char* PD_ConfigGetParamsFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->params_file().c_str(); +} + +void PD_ConfigDisableFCPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableFCPadding(); +} +PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_fc_padding(); +} + +void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config, + uint64_t memory_pool_init_size_mb, + int32_t device_id) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableUseGpu(memory_pool_init_size_mb, device_id); +} +void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGpu(); +} +PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_gpu(); +} + +void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, + int32_t l3_workspace_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableXpu(l3_workspace_size); +} +PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_xpu(); +} + +int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->gpu_device_id(); +} +int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->xpu_device_id(); +} +int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->memory_pool_init_size_mb(); +} +float PD_ConfigFractionOfGpuMemoryForPool(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->fraction_of_gpu_memory_for_pool(); +} +void PD_ConfigEnableCudnn(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableCUDNN(); +} +PD_Bool PD_ConfigCudnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cudnn_enabled(); +} + +void PD_ConfigSwitchIrOptim(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrOptim(x); +} +PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->ir_optim(); +} + +void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, + int32_t workspace_size, + int32_t max_batch_size, + int32_t min_subgraph_size, + PD_PrecisionType precision, + PD_Bool use_static, PD_Bool use_calib_mode) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtEngine( + workspace_size, max_batch_size, min_subgraph_size, + ConvertToCxxPrecisionType(precision), use_static, use_calib_mode); +} +PD_Bool PD_ConfigTensorRtEngineEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->tensorrt_engine_enabled(); +} + +void PD_ConfigSetTrtDynamicShapeInfo(__pd_keep PD_Config* pd_config, + size_t tensor_num, + const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, + int32_t** max_shape, int32_t** optim_shape, + PD_Bool 
disable_trt_plugin_fp16) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::map<std::string, std::vector<int>> min_input_shapes;
+  std::map<std::string, std::vector<int>> max_input_shapes;
+  std::map<std::string, std::vector<int>> optim_input_shapes;
+  for (size_t tensor_index = 0; tensor_index < tensor_num; ++tensor_index) {
+    std::string name(tensor_name[tensor_index]);
+    std::vector<int> min_input_shape, max_input_shape, optim_input_shape;
+    for (size_t shape_index = 0; shape_index < shapes_num[tensor_index];
+         ++shape_index) {
+      min_input_shape.emplace_back(min_shape[tensor_index][shape_index]);
+      max_input_shape.emplace_back(max_shape[tensor_index][shape_index]);
+      optim_input_shape.emplace_back(optim_shape[tensor_index][shape_index]);
+    }
+    min_input_shapes[name] = std::move(min_input_shape);
+    max_input_shapes[name] = std::move(max_input_shape);
+    optim_input_shapes[name] = std::move(optim_input_shape);
+  }
+  config->SetTRTDynamicShapeInfo(min_input_shapes, max_input_shapes,
+                                 optim_input_shapes, disable_trt_plugin_fp16);
+}
+
+void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num,
+                                 const char** ops_name) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::vector<std::string> ops_list;
+  for (size_t index = 0; index < ops_num; ++index) {
+    ops_list.emplace_back(ops_name[index]);
+  }
+  config->Exp_DisableTensorRtOPs(ops_list);
+}
+
+void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableTensorRtOSS();
+}
+PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_oss_enabled();
+}
+
+void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config,
+                                int32_t dla_core) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableTensorRtDLA(dla_core);
+}
+PD_Bool PD_ConfigTensorRtDlaEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_dla_enabled();
+}
+
+void PD_ConfigEnableLiteEngine(__pd_keep PD_Config* pd_config,
+                               PD_PrecisionType precision, PD_Bool zero_copy,
+                               size_t passes_filter_num,
+                               const char** passes_filter,
+                               size_t ops_filter_num, const char** ops_filter) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::vector<std::string> passes_filters, ops_filters;
+  for (size_t index = 0; index < passes_filter_num; ++index) {
+    passes_filters.emplace_back(passes_filter[index]);
+  }
+  for (size_t index = 0; index < ops_filter_num; ++index) {
+    ops_filters.emplace_back(ops_filter[index]);
+  }
+  config->EnableLiteEngine(ConvertToCxxPrecisionType(precision), zero_copy,
+                           passes_filters, ops_filters);
+}
+PD_Bool PD_ConfigLiteEngineEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->lite_engine_enabled();
+}
+
+void PD_ConfigSwitchIrDebug(__pd_keep PD_Config* pd_config, PD_Bool x) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SwitchIrDebug(x);
+}
+void PD_ConfigEnableMKLDNN(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMKLDNN();
+}
+void PD_ConfigSetMkldnnCacheCapacity(__pd_keep PD_Config* pd_config,
+                                     int32_t capacity) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetMkldnnCacheCapacity(capacity);
+}
+PD_Bool PD_ConfigMkldnnEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_enabled();
+}
+void PD_ConfigSetCpuMathLibraryNumThreads(
+    __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
+}
+int32_t PD_ConfigGetCpuMathLibraryNumThreads(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->cpu_math_library_num_threads();
+}
+
+void PD_ConfigSetMkldnnOp(__pd_keep PD_Config* pd_config, size_t ops_num,
+                          const char** op_list) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::unordered_set<std::string> op_names;
+  for (size_t index = 0; index < ops_num; ++index) {
+    op_names.emplace(op_list[index]);
+  }
+  config->SetMKLDNNOp(std::move(op_names));
+}
+void PD_ConfigEnableMkldnnQuantizer(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMkldnnQuantizer();
+}
+void PD_ConfigEnableMkldnnBfloat16(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMkldnnBfloat16();
+}
+PD_Bool PD_ConfigMkldnnBfloat16Enabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_bfloat16_enabled();
+}
+void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config, size_t ops_num,
+                            const char** op_list) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::unordered_set<std::string> op_names;
+  for (size_t index = 0; index < ops_num; ++index) {
+    op_names.emplace(op_list[index]);
+  }
+  config->SetBfloat16Op(std::move(op_names));
+}
+PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->thread_local_stream_enabled();
+}
+PD_Bool PD_ConfigMkldnnQuantizerEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_quantizer_enabled();
+}
+void PD_ConfigSetModelBuffer(__pd_keep PD_Config* pd_config,
+                             const char* prog_buffer, size_t prog_buffer_size,
+                             const char* params_buffer,
+                             size_t params_buffer_size) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
+                         params_buffer_size);
+}
+PD_Bool PD_ConfigModelFromMemory(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->model_from_memory();
+}
+void PD_ConfigEnableMemoryOptim(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMemoryOptim();
+}
+PD_Bool PD_ConfigMemoryOptimEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->enable_memory_optim();
+}
+void PD_ConfigEnableProfile(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableProfile();
+}
+PD_Bool PD_ConfigProfileEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->profile_enabled();
+}
+void PD_ConfigDisableGlogInfo(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->DisableGlogInfo();
+}
+PD_Bool PD_ConfigGlogInfoDisabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->glog_info_disabled();
+}
+void PD_ConfigSetInvalid(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetInValid();
+}
+PD_Bool PD_ConfigIsValid(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->is_valid();
+}
+void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableGpuMultiStream();
+}
+void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->PartiallyRelease();
+}
+
+}  // extern "C"
diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h
new file mode 100644
index 0000000000000..e44983e24484e
--- /dev/null
+++ b/paddle/fluid/inference/capi_exp/pd_config.h
@@ -0,0 +1,571 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_config.h +/// +/// \brief interface for paddle config +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Config PD_Config; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a paddle config +/// +/// \return new config. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); +/// +/// \brief Destroy the paddle config +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); +/// +/// \brief Set the combined model with two specific pathes for program and +/// parameters. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path of the combined model. +/// \param[in] params_file_path params file path of the combined model. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path); +/// +/// \brief Set the model file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetProgFile( + __pd_keep PD_Config* pd_config, const char* prog_file_path); +/// +/// \brief Set the params file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] params_file_path params file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetParamsFile( + __pd_keep PD_Config* pd_config, const char* params_file_path); +/// +/// \brief Set the path of optimization cache directory. +/// \param[in] pd_onfig config +/// \param[in] opt_cache_dir the path of optimization cache directory. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetOptimCacheDir( + __pd_keep PD_Config* pd_config, const char* opt_cache_dir); +/// +/// \brief Set the no-combined model dir path. +/// \param[in] pd_onfig config +/// \param[in] model_dir model dir path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelDir( + __pd_keep PD_Config* pd_config, const char* model_dir); +/// +/// \brief Get the model directory path. +/// +/// \param[in] pd_onfig config +/// \return The model directory path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetModelDir( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the program file path. +/// +/// \param[in] pd_onfig config +/// \return The program file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetProgFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the params file path. +/// +/// \param[in] pd_onfig config +/// \return The params file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetParamsFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off FC Padding. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableFCPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \param[in] pd_onfig config +/// \return Whether fc padding is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on GPU. +/// +/// \param[in] pd_onfig config +/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in +/// MB. +/// \param[in] device_id device_id the GPU card to use. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu( + __pd_keep PD_Config* pd_config, uint64_t memory_pool_init_size_mb, + int32_t device_id); +/// +/// \brief Turn off GPU. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \brief Turn off GPU. +/// \return Whether the GPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on XPU. +/// +/// \param[in] pd_onfig config +/// \param[in] l3_workspace_size l3 workspace size. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \param[in] pd_onfig config +/// \return Whether the XPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the GPU device id. +/// +/// \param[in] pd_onfig config +/// \return The GPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the XPU device id. +/// +/// \param[in] pd_onfig config +/// \return The XPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the initial size in MB of the GPU memory pool. +/// +/// \param[in] pd_onfig config +/// \return The initial size in MB of the GPU memory pool. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigMemoryPoolInitSizeMb( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the proportion of the initial memory pool size compared to the +/// device. +/// +/// \param[in] pd_onfig config +/// \return The proportion of the initial memory pool size. +/// +PADDLE_CAPI_EXPORT extern float PD_ConfigFractionOfGpuMemoryForPool( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on CUDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCudnn( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use CUDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use CUDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to perform IR graph optimization. +/// If turned off, the AnalysisConfig will act just like a NativeConfig. +/// +/// \param[in] pd_onfig config +/// \param[in] x Whether the ir graph optimization is actived. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief A boolean state telling whether the ir graph optimization is +/// actived. +/// +/// \param[in] pd_onfig config +/// \return Whether to use ir graph optimization. 
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Turn on the TensorRT engine.
+/// The TensorRT engine will accelerate some subgraphs in the original Fluid
+/// computation graph. In some models such as resnet50, GoogleNet and so on,
+/// it gains significant performance acceleration.
+///
+/// \param[in] pd_config config
+/// \param[in] workspace_size The memory size(in byte) used for TensorRT
+/// workspace.
+/// \param[in] max_batch_size The maximum batch size of this prediction task,
+/// better set as small as possible for less performance loss.
+/// \param[in] min_subgraph_size The minimum TensorRT subgraph size needed, if a
+/// subgraph is smaller than this, it will not be transferred to TensorRT
+/// engine.
+/// \param[in] precision The precision used in TensorRT.
+/// \param[in] use_static Serialize optimization information to disk for
+/// reusing.
+/// \param[in] use_calib_mode Use TRT int8 calibration(post training
+/// quantization).
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine(
+    __pd_keep PD_Config* pd_config, int32_t workspace_size,
+    int32_t max_batch_size, int32_t min_subgraph_size,
+    PD_PrecisionType precision, PD_Bool use_static, PD_Bool use_calib_mode);
+///
+/// \brief A boolean state telling whether the TensorRT engine is used.
+///
+/// \param[in] pd_config config
+/// \return Whether the TensorRT engine is used.
+///
+PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtEngineEnabled(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode.
+///
+/// \param[in] pd_config config
+/// \param[in] tensor_num The number of the subgraph input.
+/// \param[in] tensor_name The name of every subgraph input.
+/// \param[in] shapes_num The shape size of every subgraph input.
+/// \param[in] min_shape The min input shape of every subgraph input.
+/// \param[in] max_shape The max input shape of every subgraph input.
+/// \param[in] optim_shape The opt input shape of every subgraph input.
+/// \param[in] disable_trt_plugin_fp16 Setting this parameter to true means that
+/// the TRT plugins will not run in fp16.
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigSetTrtDynamicShapeInfo(
+    __pd_keep PD_Config* pd_config, size_t tensor_num, const char** tensor_name,
+    size_t* shapes_num, int32_t** min_shape, int32_t** max_shape,
+    int32_t** optim_shape, PD_Bool disable_trt_plugin_fp16);
+///
+/// \brief Prevent ops from running in Paddle-TRT.
+/// NOTE: just experimental, not an official stable API, easy to be broken.
+///
+/// \param[in] pd_config config
+/// \param[in] ops_num ops number
+/// \param[in] ops_name ops name
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs(
+    __pd_keep PD_Config* pd_config, size_t ops_num, const char** ops_name);
+///
+/// \brief Replace some TensorRT plugins with TensorRT OSS(
+/// https://github.com/NVIDIA/TensorRT), which may give some models higher
+/// inference performance. libnvinfer_plugin.so newer than
+/// v7.2.1 is required.
+///
+/// \param[in] pd_config config
+///
+PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS(
+    __pd_keep PD_Config* pd_config);
+///
+/// \brief A boolean state telling whether to use the TensorRT OSS.
+///
+/// \param[in] pd_config config
+/// \return Whether to use the TensorRT OSS.
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtOssEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Enable TensorRT DLA +/// +/// \param[in] pd_onfig config +/// \param[in] dla_core ID of DLACore, which should be 0, 1, +/// ..., IBuilder.getNbDLACores() - 1 +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtDla( + __pd_keep PD_Config* pd_config, int32_t dla_core); +/// +/// \brief A boolean state telling whether to use the TensorRT DLA. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT DLA. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the usage of Lite sub-graph engine. +/// +/// \param[in] pd_onfig config +/// \param[in] precision Precion used in Lite sub-graph engine. +/// \param[in] zero_copy whether use zero copy. +/// \param[in] passes_filter_num The number of passes used in Lite sub-graph +/// engine. +/// \param[in] passes_filter The name of passes used in Lite sub-graph engine. +/// \param[in] ops_filter_num The number of operators not supported by Lite. +/// \param[in] ops_filter The name of operators not supported by Lite. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableLiteEngine( + __pd_keep PD_Config* pd_config, PD_PrecisionType precision, + PD_Bool zero_copy, size_t passes_filter_num, const char** passes_filter, + size_t ops_filter_num, const char** ops_filter); +/// +/// \brief A boolean state indicating whether the Lite sub-graph engine is +/// used. +/// +/// \param[in] pd_onfig config +/// \return Whether the Lite sub-graph engine is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to debug IR graph analysis phase. +/// This will generate DOT files for visualizing the computation graph after +/// each analysis pass applied. +/// +/// \param[in] pd_onfig config +/// \param[in] x whether to debug IR graph analysis phase. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief Turn on MKLDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// Default value 0 means not caching any shape. +/// Please see MKL-DNN Data Caching Design Document: +/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md +/// +/// \param[in] pd_onfig config +/// \param[in] capacity The cache capacity. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity( + __pd_keep PD_Config* pd_config, int32_t capacity); +/// +/// \brief A boolean state telling whether to use the MKLDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the number of cpu math library threads. +/// +/// \param[in] pd_onfig config +/// \param cpu_math_library_num_threads The number of cpu math library +/// threads. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads); +/// +/// \brief An int state telling how many threads are used in the CPU math +/// library. +/// +/// \param[in] pd_onfig config +/// \return The number of threads used in the CPU math library. 
+/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the operator type list to use MKLDNN acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Turn on MKLDNN quantization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the MKLDNN quantization is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on MKLDNN bfloat16. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN Bfloat16. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled( + __pd_keep PD_Config* pd_config); +/// \brief Specify the operator type list to use Bfloat16 acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Enable the GPU multi-computing stream feature. +/// NOTE: The current behavior of this interface is to bind the computation +/// stream to the thread, and this behavior may be changed in the future. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the thread local CUDA stream is +/// enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the thread local CUDA stream is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the memory buffer of program and parameter. +/// Used when model and params are loaded directly from memory. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_buffer The memory buffer of program. +/// \param[in] prog_buffer_size The size of the model data. +/// \param[in] params_buffer The memory buffer of the combined parameters file. +/// \param[in] params_buffer_size The size of the combined parameters data. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelBuffer( + __pd_keep PD_Config* pd_config, const char* prog_buffer, + size_t prog_buffer_size, const char* params_buffer, + size_t params_buffer_size); +/// +/// \brief A boolean state telling whether the model is set from the CPU +/// memory. +/// +/// \param[in] pd_onfig config +/// \return Whether model and params are loaded directly from memory. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigModelFromMemory( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on memory optimize +/// NOTE still in development. 
+/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMemoryOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the memory optimization is +/// activated. +/// +/// \param[in] pd_onfig config +/// \return Whether the memory optimization is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMemoryOptimEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on profiling report. +/// If not turned on, no profiling report will be generated. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableProfile( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the profiler is activated. +/// +/// \param[in] pd_onfig config +/// \return bool Whether the profiler is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigProfileEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Mute all logs in Paddle inference. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGlogInfo( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether logs in Paddle inference are muted. +/// +/// \param[in] pd_onfig config +/// \return Whether logs in Paddle inference are muted. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigGlogInfoDisabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the Config to be invalid. +/// This is to ensure that an Config can only be used in one +/// Predictor. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetInvalid( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the Config is valid. +/// +/// \param[in] pd_onfig config +/// \return Whether the Config is valid. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid( + __pd_keep PD_Config* pd_config); +/// +/// \brief Partially release the memory +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease( + __pd_keep PD_Config* pd_config); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_inference_api.h b/paddle/fluid/inference/capi_exp/pd_inference_api.h new file mode 100644 index 0000000000000..5f21dca1a7bf6 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_inference_api.h @@ -0,0 +1,22 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pd_common.h" // NOLINT +#include "pd_config.h" // NOLINT +#include "pd_predictor.h" // NOLINT +#include "pd_tensor.h" // NOLINT +#include "pd_types.h" // NOLINT +#include "pd_utils.h" // NOLINT diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc new file mode 100644 index 0000000000000..f5287a5152957 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/capi_exp/pd_predictor.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/capi_exp/pd_types.h"
+#include "paddle/fluid/inference/capi_exp/pd_utils.h"
+#include "paddle/fluid/inference/capi_exp/types_internal.h"
+#include "paddle/fluid/inference/capi_exp/utils_internal.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#define CHECK_AND_CONVERT_PD_PREDICTOR                                  \
+  PADDLE_ENFORCE_NOT_NULL(                                              \
+      pd_predictor,                                                     \
+      paddle::platform::errors::InvalidArgument(                        \
+          "The pointer of paddle predictor shouldn't be nullptr"));     \
+  auto& predictor = pd_predictor->predictor
+
+extern "C" {
+__pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) {
+  PADDLE_ENFORCE_NOT_NULL(
+      pd_config, paddle::platform::errors::InvalidArgument(
+                     "The pointer of paddle config shouldn't be nullptr"));
+  PD_Predictor* pd_predictor = new PD_Predictor();
+  paddle_infer::Config* config =
+      reinterpret_cast<paddle_infer::Config*>(pd_config);
+  pd_predictor->predictor = paddle_infer::CreatePredictor(*config);
+  delete config;
+  return pd_predictor;
+}
+
+__pd_give PD_Predictor* PD_PredictorClone(
+    __pd_keep PD_Predictor* pd_predictor) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  PD_Predictor* new_predictor = new PD_Predictor();
+  new_predictor->predictor = predictor->Clone();
+  return new_predictor;
+}
+
+__pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames(
+    __pd_keep PD_Predictor* pd_predictor) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  std::vector<std::string> names = predictor->GetInputNames();
+  return paddle_infer::CvtVecToOneDimArrayCstr(names);
+}
+
+__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames(
+    __pd_keep PD_Predictor* pd_predictor) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  std::vector<std::string> names = predictor->GetOutputNames();
+  return paddle_infer::CvtVecToOneDimArrayCstr(names);
+}
+
+size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  return predictor->GetInputNames().size();
+}
+
+size_t PD_PredictorGetOutputNum(__pd_keep PD_Predictor* pd_predictor) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  return predictor->GetOutputNames().size();
+}
+__pd_give PD_Tensor* PD_PredictorGetInputHandle(
+    __pd_keep PD_Predictor* pd_predictor, const char* name) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  PD_Tensor* pd_tensor = new PD_Tensor();
+  pd_tensor->tensor = predictor->GetInputHandle(name);
+  return pd_tensor;
+}
+
+__pd_give PD_Tensor* PD_PredictorGetOutputHandle(
+    __pd_keep PD_Predictor* pd_predictor, const char* name) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  PD_Tensor* pd_tensor = new PD_Tensor();
+  pd_tensor->tensor = predictor->GetOutputHandle(name);
+  return pd_tensor;
+}
+
+PD_Bool PD_PredictorRun(__pd_keep PD_Predictor* pd_predictor) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
+  return predictor->Run();
+}
+
+void PD_PredictorClearIntermediateTensor(__pd_keep PD_Predictor* pd_predictor) {
+  CHECK_AND_CONVERT_PD_PREDICTOR;
predictor->ClearIntermediateTensor(); +} + +uint64_t PD_PredictorTryShrinkMemory(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->TryShrinkMemory(); +} + +void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { + delete pd_predictor; +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h new file mode 100644 index 0000000000000..d4542d0b6d394 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_predictor.h +/// +/// \brief interface for paddle predictor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Predictor PD_Predictor; +typedef struct PD_Config PD_Config; +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a new Predictor +/// +/// \param[in] Config config +/// \return new predicor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( + __pd_take PD_Config* pd_config); +/// +/// \brief Clone a new Predictor +/// +/// \param[in] pd_predictor predictor +/// \return new predictor. 
+/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the input names +/// +/// \param[in] pd_predictor predictor +/// \return input names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the output names +/// +/// \param[in] pd_predictor predictor +/// \return output names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the input number +/// +/// \param[in] pd_predictor predictor +/// \return input number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetInputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the output number +/// +/// \param[in] pd_predictor predictor +/// \return output number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetOutputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name input name +/// \return input tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name output name +/// \return output tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Run the prediction engine +/// +/// \param[in] pd_predictor predictor +/// \return Whether the function executed successfully +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_PredictorRun( + __pd_keep PD_Predictor* pd_predictor); + +/// \brief Clear the intermediate tensors of the predictor +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorClearIntermediateTensor( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. +/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \param[in] pd_predictor predictor +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Destroy a predictor object +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( + __pd_take PD_Predictor* pd_predictor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc new file mode 100644 index 0000000000000..9c661dea6f2bb --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/capi_exp/pd_tensor.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/capi_exp/pd_types.h"
+#include "paddle/fluid/inference/capi_exp/pd_utils.h"
+#include "paddle/fluid/inference/capi_exp/types_internal.h"
+#include "paddle/fluid/inference/capi_exp/utils_internal.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#define CHECK_AND_CONVERT_PD_TENSOR                                         \
+  PADDLE_ENFORCE_NOT_NULL(                                                  \
+      pd_tensor, paddle::platform::errors::InvalidArgument(                 \
+                     "The pointer of paddle tensor shouldn't be nullptr")); \
+  auto& tensor = pd_tensor->tensor
+
+extern "C" {
+
+void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor) { delete pd_tensor; }
+void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, size_t shape_size,
+                      int32_t* shape) {
+  CHECK_AND_CONVERT_PD_TENSOR;
+  std::vector<int> shapes(shape_size);
+  for (size_t index = 0; index < shape_size; ++index) {
+    shapes[index] = shape[index];
+  }
+  tensor->Reshape(shapes);
+}
+
+#define REPEAT_ALL_DATA_TYPE(func)                              \
+  func(float, Float) func(int64_t, Int64) func(int32_t, Int32)  \
+      func(uint8_t, Uint8) func(int8_t, Int8)
+
+#define PD_TENSOR_MUTABLE_DATA_IMPL(type, Type)                                 \
+  type* PD_TensorMutableData##Type(__pd_keep PD_Tensor* pd_tensor,              \
+                                   PD_PlaceType place) {                        \
+    CHECK_AND_CONVERT_PD_TENSOR;                                                \
+    return tensor->mutable_data<type>(paddle_infer::CvtToCxxPlaceType(place));  \
+  }
+REPEAT_ALL_DATA_TYPE(PD_TENSOR_MUTABLE_DATA_IMPL)
+#undef PD_TENSOR_MUTABLE_DATA_IMPL
+
+#define PD_TENSOR_DATA_IMPL(type, Type)                                        \
+  type* PD_TensorData##Type(__pd_keep PD_Tensor* pd_tensor,                    \
+                            PD_PlaceType* place, int32_t* size) {              \
+    CHECK_AND_CONVERT_PD_TENSOR;                                               \
+    PADDLE_ENFORCE_NOT_NULL(place,                                             \
+                            paddle::platform::errors::InvalidArgument(        \
+                                "The pointer of place shouldn't be nullptr")); \
+    PADDLE_ENFORCE_NOT_NULL(size,                                              \
+                            paddle::platform::errors::InvalidArgument(        \
+                                "The pointer of size shouldn't be nullptr"));  \
+    paddle_infer::PlaceType cxx_place_type;                                    \
+    int cxx_size;                                                              \
+    type* data = tensor->data<type>(&cxx_place_type, &cxx_size);               \
+    *place = paddle_infer::CvtFromCxxPlaceType(cxx_place_type);                \
+    *size = static_cast<int32_t>(cxx_size);                                    \
+    return data;                                                               \
+  }
+REPEAT_ALL_DATA_TYPE(PD_TENSOR_DATA_IMPL)
+#undef PD_TENSOR_DATA_IMPL
+
+#define PD_TENSOR_COPY_FROM_CPU_IMPL(type, Type)                   \
+  void PD_TensorCopyFromCpu##Type(__pd_keep PD_Tensor* pd_tensor,  \
+                                  const type* data) {              \
+    CHECK_AND_CONVERT_PD_TENSOR;                                   \
+    tensor->CopyFromCpu(data);                                     \
+  }
+REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_FROM_CPU_IMPL)
+#undef PD_TENSOR_COPY_FROM_CPU_IMPL
+
+#define PD_TENSOR_COPY_TO_CPU_IMPL(type, Type)                                 \
+  void PD_TensorCopyToCpu##Type(__pd_keep PD_Tensor* pd_tensor, type* data) {  \
+    CHECK_AND_CONVERT_PD_TENSOR;                                               \
+    tensor->CopyToCpu(data);                                                   \
+  }
+REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_TO_CPU_IMPL)
+#undef PD_TENSOR_COPY_TO_CPU_IMPL
+
+#undef REPEAT_ALL_DATA_TYPE
+
+__pd_give PD_OneDimArrayInt32* PD_TensorGetShape(
+    __pd_keep PD_Tensor* pd_tensor) {
+  CHECK_AND_CONVERT_PD_TENSOR;
+  return paddle_infer::CvtVecToOneDimArrayInt32(tensor->shape());
+}
+void PD_TensorSetLod(__pd_keep PD_Tensor* pd_tensor,
+                     __pd_keep
PD_TwoDimArraySize* lod) { + CHECK_AND_CONVERT_PD_TENSOR; + tensor->SetLoD(paddle_infer::CvtTwoDimArrayToVecSize(lod)); +} +__pd_give PD_TwoDimArraySize* PD_TensorGetLod(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToTwoDimArraySize(tensor->lod()); +} +const char* PD_TensorGetName(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return tensor->name().c_str(); +} +PD_DataType PD_TensorGetDataType(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtFromCxxDatatype(tensor->type()); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.h b/paddle/fluid/inference/capi_exp/pd_tensor.h new file mode 100644 index 0000000000000..29ea4b5d62e43 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.h @@ -0,0 +1,287 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_tensor.h +/// +/// \brief interface for paddle tensor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayInt32 PD_OneDimArrayInt32; +typedef struct PD_TwoDimArraySize PD_TwoDimArraySize; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the paddle tensor +/// +/// \param[in] pd_tensor tensor +/// +PADDLE_CAPI_EXPORT extern void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor); + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// Reshape must be called before calling PD_TensorMutableData*() or +/// PD_TensorCopyFromCpu*() +/// +/// \param[in] pd_tensor tensor. +/// \param[in] shape_size The size of shape. +/// \param[in] shape The shape to set. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, + size_t shape_size, + int32_t* shape); + +/// +/// \brief Get the memory pointer in CPU or GPU with 'float' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorMutableDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int64_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorMutableDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int32_t' data type. +/// Please Reshape the tensor first before call this. 
+/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorMutableDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'uint8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorMutableDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorMutableDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. 
+/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuFloat( + __pd_keep PD_Tensor* pd_tensor, const float* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt64( + __pd_keep PD_Tensor* pd_tensor, const int64_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt32( + __pd_keep PD_Tensor* pd_tensor, const int32_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuUint8( + __pd_keep PD_Tensor* pd_tensor, const uint8_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt8( + __pd_keep PD_Tensor* pd_tensor, const int8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuFloat( + __pd_keep PD_Tensor* pd_tensor, float* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt64( + __pd_keep PD_Tensor* pd_tensor, int64_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt32( + __pd_keep PD_Tensor* pd_tensor, int32_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuUint8( + __pd_keep PD_Tensor* pd_tensor, uint8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt8( + __pd_keep PD_Tensor* pd_tensor, int8_t* data); +/// +/// \brief Get the tensor shape +/// \param[in] pd_tensor tensor. +/// \return The tensor shape. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor); + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. +/// \param[in] lod lod information. 
+/// +PADDLE_CAPI_EXPORT extern void PD_TensorSetLod( + __pd_keep PD_Tensor* pd_tensor, __pd_keep PD_TwoDimArraySize* lod); +/// +/// \brief Get the tensor lod information +/// \param[in] pd_tensor tensor. +/// \return the lod information. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_TwoDimArraySize* PD_TensorGetLod( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor name +/// \param[in] pd_tensor tensor. +/// \return the tensor name. +/// +PADDLE_CAPI_EXPORT extern const char* PD_TensorGetName( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +PADDLE_CAPI_EXPORT extern PD_DataType PD_TensorGetDataType( + __pd_keep PD_Tensor* pd_tensor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h new file mode 100644 index 0000000000000..a5da2913a9b20 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "pd_common.h" // NOLINT + +typedef struct PD_OneDimArrayInt32 { + size_t size; + int32_t* data; +} PD_OneDimArrayInt32; // std::vector + +typedef struct PD_OneDimArraySize { + size_t size; + size_t* data; +} PD_OneDimArraySize; // std::vector + +typedef struct PD_OneDimArrayCstr { + size_t size; + char** data; +} PD_OneDimArrayCstr; // std::vector + +typedef struct PD_TwoDimArraySize { + size_t size; + PD_OneDimArraySize** data; +} PD_TwoDimArraySize; // std::vector> diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc new file mode 100644 index 0000000000000..2e762619f5567 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <string>
+
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/capi_exp/pd_utils.h"
+#include "paddle/fluid/inference/capi_exp/utils_internal.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#define DESTROY_ONE_DIM_ARRAY(type)                                            \
+  void PD_OneDimArray##type##Destroy(__pd_take PD_OneDimArray##type* array) {  \
+    if (array != NULL) {                                                       \
+      delete[] array->data;                                                    \
+      delete array;                                                            \
+    }                                                                          \
+  }
+#define CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type)     \
+  __pd_give PD_OneDimArray##Type* CvtVecToOneDimArray##Type(   \
+      const std::vector<vec_type>& vec) {                      \
+    PD_OneDimArray##Type* array = new PD_OneDimArray##Type;    \
+    array->size = vec.size();                                  \
+    array->data = vec.empty() ? NULL : new type[vec.size()];   \
+    for (size_t index = 0; index < vec.size(); ++index) {      \
+      array->data[index] = vec[index];                         \
+    }                                                          \
+    return array;                                              \
+  }
+#define CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type)    \
+  std::vector<vec_type> CvtOneDimArrayToVec##Type(            \
+      __pd_keep const PD_OneDimArray##Type* array) {          \
+    std::vector<vec_type> vec;                                 \
+    if (array != NULL) {                                       \
+      vec.resize(array->size);                                 \
+      for (size_t index = 0; index < array->size; ++index) {   \
+        vec[index] = array->data[index];                       \
+      }                                                        \
+    }                                                          \
+    return vec;                                                \
+  }
+
+#define ONE_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type)  \
+  extern "C" {                                               \
+  DESTROY_ONE_DIM_ARRAY(Type);                               \
+  }                                                          \
+  namespace paddle_infer {                                   \
+  CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type)         \
+  CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type)         \
+  }
+
+ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int)
+ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t)
+
+#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL
+#undef CONVERT_ONE_DIM_ARRAY_TO_VEC
+#undef CONVERT_VEC_TO_ONE_DIM_ARRAY
+#undef DESTROY_ONE_DIM_ARRAY
+
+void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) {
+  if (array != NULL) {
+    if (array->size != 0) {
+      for (size_t index = 0; index < array->size; ++index) {
+        delete[] array->data[index];
+      }
+    }
+    delete[] array->data;
+    delete array;
+  }
+}
+namespace paddle_infer {
+
+__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr(
+    const std::vector<std::string>& vec) {
+  PD_OneDimArrayCstr* array = new PD_OneDimArrayCstr;
+  array->size = vec.size();
+  array->data = vec.empty() ? NULL : new char*[vec.size()];
+  for (size_t index = 0u; index < vec.size(); ++index) {
+    array->data[index] = new char[vec[index].size() + 1];
+    memcpy(array->data[index], vec[index].c_str(), vec[index].size() + 1);
+  }
+  return array;
+}
+
+std::vector<std::string> CvtOneDimArrayToVecCstr(
+    __pd_keep const PD_OneDimArrayCstr* array) {
+  std::vector<std::string> vec;
+  for (size_t index = 0; index < array->size; ++index) {
+    vec.emplace_back(array->data[index]);
+  }
+  return vec;
+}
+
+}  // namespace paddle_infer
+
+#define DESTROY_TWO_DIM_ARRAY(type)                                            \
+  void PD_TwoDimArray##type##Destroy(__pd_take PD_TwoDimArray##type* array) {  \
+    if (array != NULL) {                                                       \
+      if (array->size != 0) {                                                  \
+        for (size_t index = 0; index < array->size; ++index) {                 \
+          PD_OneDimArray##type##Destroy(array->data[index]);                   \
+        }                                                                      \
+      }                                                                        \
+      delete[] array->data;                                                    \
+      delete array;                                                            \
+    }                                                                          \
+  }
+#define CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type)                      \
+  __pd_give PD_TwoDimArray##Type* CvtVecToTwoDimArray##Type(                    \
+      const std::vector<std::vector<vec_type>>& vec) {                          \
+    PD_TwoDimArray##Type* array = new PD_TwoDimArray##Type;                     \
+    array->size = vec.size();                                                   \
+    array->data = vec.empty() ? NULL : new PD_OneDimArray##Type*[vec.size()];   \
+    for (size_t index = 0; index < vec.size(); ++index) {                       \
+      array->data[index] = CvtVecToOneDimArray##Type(vec[index]);               \
+    }                                                                           \
+    return array;                                                               \
+  }
+#define CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type)             \
+  std::vector<std::vector<vec_type>> CvtTwoDimArrayToVec##Type(        \
+      __pd_keep const PD_TwoDimArray##Type* array) {                   \
+    std::vector<std::vector<vec_type>> vec;                            \
+    if (array != NULL && array->size != 0) {                           \
+      vec.resize(array->size);                                         \
+      for (size_t index = 0; index < array->size; ++index) {           \
+        vec[index] = CvtOneDimArrayToVec##Type((array->data)[index]);  \
+      }                                                                \
+    }                                                                  \
+    return vec;                                                        \
+  }
+#define TWO_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type)  \
+  extern "C" {                                               \
+  DESTROY_TWO_DIM_ARRAY(Type);                               \
+  }                                                          \
+  namespace paddle_infer {                                   \
+  CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type)         \
+  CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type)         \
+  }
+
+TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t)
+
+#undef TWO_DIM_ARRAY_UTILS_FUNC_IMPL
+#undef CONVERT_TWO_DIM_ARRAY_TO_VEC
+#undef CONVERT_VEC_TO_TWO_DIM_ARRAY
+#undef DESTROY_TWO_DIM_ARRAY
+
+namespace paddle_infer {
+
+PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) {
+  switch (place_type) {
+    case PD_PLACE_UNK:
+      return PlaceType::kUNK;
+    case PD_PLACE_CPU:
+      return PlaceType::kCPU;
+    case PD_PLACE_GPU:
+      return PlaceType::kGPU;
+    case PD_PLACE_XPU:
+      return PlaceType::kXPU;
+    default:
+      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+          "Unsupported paddle place type %d.", place_type));
+      return PlaceType::kUNK;
+  }
+}
+
+PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type) {
+  switch (place_type) {
+    case PlaceType::kCPU:
+      return PD_PLACE_CPU;
+    case PlaceType::kGPU:
+      return PD_PLACE_GPU;
+    case PlaceType::kXPU:
+      return PD_PLACE_XPU;
+    default:
+      return PD_PLACE_UNK;
+  }
+}
+
+DataType CvtToCxxDatatype(PD_DataType data_type) {
+  switch (data_type) {
+    case PD_DATA_FLOAT32:
+      return DataType::FLOAT32;
+    case PD_DATA_INT64:
+      return DataType::INT64;
+    case PD_DATA_INT32:
+      return DataType::INT32;
+    case PD_DATA_UINT8:
+      return DataType::UINT8;
+    default:
+      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+          "Unsupported paddle data type %d.", data_type));
+      return DataType::FLOAT32;
+  }
+}
+
+PD_DataType CvtFromCxxDatatype(DataType data_type) {
+  switch (data_type) {
+    case DataType::FLOAT32:
+      return PD_DATA_FLOAT32;
+    case DataType::INT64:
+      return PD_DATA_INT64;
+    case DataType::INT32:
+      return PD_DATA_INT32;
+    case DataType::UINT8:
+      return PD_DATA_UINT8;
+    default:
+      return PD_DATA_UNK;
+  }
+}
+
+}  // namespace paddle_infer
diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h
new file mode 100644
index 0000000000000..68e519d4bb5e9
--- /dev/null
+++ b/paddle/fluid/inference/capi_exp/pd_utils.h
@@ -0,0 +1,70 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+///
+/// \file pd_utils.h
+///
+/// \brief Some utility functions to destroy paddle structs.
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include + +#include "pd_types.h" // NOLINT + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the PD_OneDimArrayInt32 object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayInt32 object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy( + __pd_take PD_OneDimArrayInt32* array); + +/// +/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayCstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayCstrDestroy( + __pd_take PD_OneDimArrayCstr* array); + +/// +/// \brief Destroy the PD_OneDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( + __pd_take PD_OneDimArraySize* array); + +/// +/// \brief Destroy the PD_TwoDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_TwoDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( + __pd_take PD_TwoDimArraySize* array); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/types_internal.h b/paddle/fluid/inference/capi_exp/types_internal.h new file mode 100644 index 0000000000000..8a61b9a884c3b --- /dev/null +++ b/paddle/fluid/inference/capi_exp/types_internal.h @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_common.h" + +typedef struct PD_Tensor { + std::unique_ptr tensor; +} PD_Tensor; + +typedef struct PD_Predictor { + std::shared_ptr predictor; +} PD_Predictor; diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h new file mode 100644 index 0000000000000..fbae512ecd855 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file utils_internal.h +/// +/// \brief Some utility function used to convert object between C Struct and C++ +/// Class. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" + +namespace paddle_infer { + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArrayInt32' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecInt32( + __pd_keep const PD_OneDimArrayInt32* array); + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArraySize' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArraySize* CvtVecToOneDimArraySize( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArraySize' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecSize( + __pd_keep const PD_OneDimArraySize* array); + +/// +/// \brief Convert the 'std::vector' object to a +/// 'PD_OneDimArrayCstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayCstr' object to a +/// 'std::vector' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array); + +/// +/// \brief Convert the 'std::vector>' object to a +/// 'PD_TwoDimArraySize' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( + const std::vector>& vec); + +/// +/// \brief Convert the 'PD_TwoDimArraySize' object to a +/// 'std::vector>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector> CvtTwoDimArrayToVecSize( + __pd_keep const PD_TwoDimArraySize* array); + +/// +/// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type); + +/// +/// \brief Convert the 'paddle_infer::PlaceType' object to a 'PD_PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type); + +/// +/// \brief Convert the 'PD_DataType' object to a 'paddle_infer::DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +DataType CvtToCxxDatatype(PD_DataType data_type); + +/// +/// \brief Convert the 'paddle_infer::DataType' object to a 'PD_DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. 
+/// +PD_DataType CvtFromCxxDatatype(DataType data_type); + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 59a786e46c98b..908e1ab990bb7 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -59,8 +59,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( #endif #ifdef LITE_SUBGRAPH_WITH_XPU + // Deprecated in Paddle-Lite release/v2.8 lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); + lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size, + cfg.locked); + lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); + lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, + cfg.adaptive_seqlen); #endif // create predictor diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5ba487cc24d7d..a64ef1eda828b 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -42,6 +42,11 @@ struct EngineConfig { // for xpu size_t xpu_l3_workspace_size; + bool locked = false; + bool autotune = true; + std::string autotune_file = ""; + std::string precision = "int16"; + bool adaptive_seqlen = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index a6484a1355705..7ea41839cb939 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -38,38 +38,6 @@ class BatchNormOpConverter : public OpConverter { VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "Invalid input X's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Bias's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Mean's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Mean").size())); // Mean is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Scale's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Variance").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Variance's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("Variance").size())); // Variance is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "Invalid output Y's size of batch_norm TRT " - "converter. 
Expected 1, received %d.", - op_desc.Output("Y").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 5515cd35daedc..ba47358b147db 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -36,18 +36,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 input, but got %d input.", - op_desc.Input("Input").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 filter, but got %d filter.", - op_desc.Input("Filter").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 output, but got %d output.", - op_desc.Output("Output").size())); auto* X = engine->GetITensor(op_desc.Input("Input").front()); std::string filter_var_name = op_desc.Input("Filter").front(); @@ -61,13 +49,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - if (op_desc.Type() != "conv2d_transpose") { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument("Input scale not found. TRT int8" - " requires conv/deconv to have " - "input quantization scales.")); - } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = @@ -184,14 +165,6 @@ class Deconv2dOpConverter : public OpConverter { return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - // In trt Deconv, dilation should be 1, ohter values are not - // supported. 
- bool condition = (dilations.d[0] == 1 && dilations.d[1] == 1); - PADDLE_ENFORCE_EQ(condition, true, - platform::errors::InvalidArgument( - "In Deconv, Dilations must be (1, 1) for " - "tensorRT, but given (%d, %d)", - dilations.d[0], dilations.d[1])); }, "conv2d_transpose"); } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 74057addecd1f..5419933e40736 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -43,25 +43,6 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but reveceid Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL( @@ -193,25 +174,6 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but received Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); std::vector itensors; diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index f13f172454123..66a682db07b91 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -34,13 +34,17 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - auto id_names = op_desc.Input("Ids"); - auto emb_names = op_desc.Input("Embs"); + auto word_id_name = op_desc.Input("WordId").front(); + auto pos_id_name = op_desc.Input("PosId").front(); + auto sent_id_name = op_desc.Input("SentId").front(); + auto word_emb_name = op_desc.Input("WordEmbedding").front(); + auto pos_emb_name = op_desc.Input("PosEmbedding").front(); + auto sent_emb_name = 
op_desc.Input("SentEmbedding").front(); + std::vector id_names = {word_id_name, pos_id_name, + sent_id_name}; + std::vector emb_names = {word_emb_name, pos_emb_name, + sent_emb_name}; - PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(), - platform::errors::InvalidArgument( - "The id and emb size of fused EmbEltwiseLayerNormOp " - "should be same ")); int input_num = id_names.size(); // Declare inputs @@ -91,99 +95,96 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); - if (enable_int8) { - output_fp16 = 1; - } - PADDLE_ENFORCE_EQ( - output_fp16, 1, - platform::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). " - "But Precision::KFloat32 is setted.")); - const std::vector fields{ - {"bert_embeddings_layernorm_beta", bias, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(bias_size)}, - {"bert_embeddings_layernorm_gamma", scale, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(scale_size)}, - {"bert_embeddings_word_embeddings", input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[0])}, - {"bert_embeddings_token_type_embeddings", input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[2])}, - {"bert_embeddings_position_embeddings", input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[1])}, - {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, - }; - - // remember to free - nvinfer1::PluginFieldCollection* plugin_ptr = - static_cast( - malloc(sizeof(*plugin_ptr) + - fields.size() * sizeof(nvinfer1::PluginField))); - plugin_ptr->nbFields = static_cast(fields.size()); - plugin_ptr->fields = fields.data(); - - std::vector plugin_inputs; - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(0)->getName())); // word_embedding, - // eval_placeholder_0 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(1)->getName())); // sent_embedding, - // eval_placeholder_1 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); - auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, - *const_cast(max_seqlen_tensor)); - nvinfer1::Dims shape_dim; - shape_dim.nbDims = 1; - shape_dim.d[0] = -1; - shuffle_layer->setReshapeDimensions(shape_dim); - plugin_inputs.emplace_back( - shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 - - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomEmbLayerNormPluginDynamic", "2"); - - auto plugin_obj = creator->createPlugin( - "CustomEmbLayerNormPluginDynamic", plugin_ptr); - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); - layer = plugin_layer; - free(plugin_ptr); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", - {output_name, std::string("qkv_plugin_mask")}, - test_mode); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin 
= new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, with_fp16); - layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); + if (enable_int8) { + output_fp16 = 1; } + PADDLE_ENFORCE_EQ( + input_num, 3, + platform::errors::InvalidArgument( + "When using oss and var-len, embedding_eltwise_layernorm op" + "should have 3 inputs only, but got %d.", + input_num)); + PADDLE_ENFORCE_EQ( + output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). " + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back( + engine_->GetITensor(word_id_name)); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back( + engine_->GetITensor(sent_id_name)); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back( + engine_->GetITensor(pos_id_name)); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, with_fp16); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); } #else diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 194d76c737c7f..6167e68df2b67 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -160,66 +160,67 @@ class FcOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { // not NCHW layout, but NLP layout with added 'x 1 x 1' auto x_dim = X->getDimensions(); - if (x_dim.nbDims == 3 || x_dim.nbDims == 2) { - auto output_name = op_desc.Output("Out").front(); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - reshape_before_fc_dim.nbDims = x_dim.nbDims + 2; - for (int i = 0; i < x_dim.nbDims; i++) { - reshape_before_fc_dim.d[i] = 0; - } - reshape_before_fc_dim.d[x_dim.nbDims] = 1; - reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1; - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && + x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) { + // fc which is just after self attention + regist_fc(X, n_output, weight, bias); + return; + } + PADDLE_ENFORCE_LE( + x_dim.nbDims - x_num_col_dims, 3, + platform::errors::InvalidArgument( + "Params and input dims mismatch. 
Paddle-TRT FC " + "converter expects x_dim.nbDims - x_num_col_dims <= 3, but " + "x_dim.nbDims = %d, x_num_col_dims = %d.", + x_dim.nbDims, x_num_col_dims)); + auto output_name = op_desc.Output("Out").front(); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + // padding shape "x 1 x 1" + int padding_length = 3 - (x_dim.nbDims - x_num_col_dims); + reshape_before_fc_dim.nbDims = x_dim.nbDims + padding_length; + int cur_dim_index = reshape_before_fc_dim.nbDims - 1; + while (padding_length-- > 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 1; + } + while (cur_dim_index >= 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 0; + } - // add fc layer - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), - n_output, weight.get(), bias.get()); - fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - if (x_dim.nbDims == 3) { - if (x_num_col_dims == 2) { - reshape_after_fc_dim.nbDims = 3; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - reshape_after_fc_dim.d[2] = 0; - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - auto dim = fc_layer->getOutput(0)->getDimensions(); - reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2]; - } - // x_dim.nbDims == 2 - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - } - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + // add fc layer + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n_output, weight.get(), bias.get()); + fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); - if (activation_type == "relu") { - reshape_after_fc_layer->setName( - ("shuffle_after_fc(Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", - {output_name}, test_mode); - } else { - RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", - {output_name}, test_mode); - } + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + + if (activation_type == "relu") { + reshape_after_fc_layer->setName( + ("shuffle_after_fc(Output: " + output_name + ")").c_str()); + nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", + {output_name}, test_mode); } else { - regist_fc(X, n_output, weight, bias); + RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", + {output_name}, 
test_mode); } return; } diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index ca5b6a8b52e79..0436499cd4075 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,15 +47,7 @@ class GeluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 output, but got %d", output_num)); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 9dc40ceec4809..7ef79e547d09a 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -41,17 +41,7 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ( - input_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ( - output_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 output, but got %d", output_num)); const float threshold = op_desc.HasAttr("threshold") diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index c1f266bacfec5..0b97b5d87a3d5 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -25,25 +25,6 @@ class LayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid layer_norm op to tensorrt layer_norm plugin"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "input of layer_norm op converter should be 1, got %d", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Bias of layer_norm op converter should be 1, got %d", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Scale of layer_norm op converter should be 1, got %d", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "output of layer_norm op converter should be 1, got %d", - op_desc.Input("Y").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index c2ffb3f3197c1..d6277b5208d5a 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -36,21 +36,7 @@ class LeakyReluOpConverter : public OpConverter { VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - // Declare inputs - size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "inputs. Expected 1, but received %d", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "outputs. Expected 1, but received %d", - output_num)); // Get attrs float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); nvinfer1::ILayer* output_layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc index e91a2ee13f4c2..3940cc5dce1b0 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -65,13 +65,6 @@ class NearestInterpolateOpConverter : public OpConverter { scale_w = scale; } else { // axis are different in static/dynamic mode - PADDLE_ENFORCE_GT( - out_h, 0, platform::errors::InvalidArgument( - "out_h must be greater than 0 if scale is not set.")); - PADDLE_ENFORCE_GT( - out_w, 0, platform::errors::InvalidArgument( - "out_w must be greater than 0 if scale is not set.")); - bool with_dynamic = engine_->with_dynamic_shape(); int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 6bf50e4742dd2..d6711bbbd2cb5 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -43,8 +43,6 @@ class PadOpConverter : public OpConverter { const std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); - const float pad_value = - BOOST_GET_CONST(float, op_desc.GetAttr("pad_value")); nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; @@ -62,9 +60,6 @@ class PadOpConverter : public OpConverter { "(nbDims + 1) * 2 == pad_size. 
But " "received nbDims:%d, pad_size:%d.", nbDims, pad_size)); - PADDLE_ENFORCE_EQ(pad_value, 0.0, - platform::errors::InvalidArgument( - "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c10072602d7c5..90d6392fd6404 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -66,15 +66,6 @@ class Pool2dOpConverter : public OpConverter { VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 input, but got %d input.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 Output, but got %d output.", - op_desc.Output("Out").size())); - auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::Dims input_shape = input1->getDimensions(); int input_dims = input_shape.nbDims; @@ -110,10 +101,6 @@ class Pool2dOpConverter : public OpConverter { nv_pool_type = nvinfer1::PoolingType::kAVERAGE; reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; - } else { - PADDLE_THROW(platform::errors::Fatal( - "Wrong pool op type, the trt do not support the %s pool type.", - pool_type)); } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 74d77d8be4493..a8a36e1238168 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -31,19 +31,7 @@ class PReluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of prelu TRT converter. " - "Expected 1, received %d.", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid output Out's size of prelu TRT converter. 
" - "Expected 1, received %d.", - output_num)); // Get attrs std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode")); // diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 1329608aecd20..654fe7e013379 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -62,12 +62,6 @@ class RoiAlignOpConverter : public OpConverter { std::vector inputs{input_tensor, rois_tensor}; nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - engine_->with_dynamic_shape(), true, - platform::errors::InvalidArgument( - "TRT roi align plugin only accept the dynamic shape, because that " - "the roi_align will change the batch size.")); - auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); auto roi_align_layer = engine_->network()->addPluginV2( diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index bf1f82076a66c..0fdc262f7e740 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -50,12 +50,6 @@ class ShuffleChannelOpConverter : public OpConverter { int w = input_dims.d[2]; int group = BOOST_GET_CONST(int, op_desc.GetAttr("group")); - if (engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, " - "the shuffle_channel op does not support dynamic shape yet")); - } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); nvinfer1::Dims4 reshape_dim(group, c / group, h, w); layer->setReshapeDimensions(reshape_dim); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index b44bdcef7123c..e621ac0514109 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -52,57 +52,57 @@ class SkipLayerNormOpConverter : public OpConverter { bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "2"); - assert(creator != nullptr); - int type = static_cast((engine_->WithFp16() == 1) - ? 
nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT); - int ld = input1->getDimensions().d[2]; // hidden dimension - assert(ld > 0); - - if (enable_int8) { - type = static_cast(nvinfer1::DataType::kHALF); - } - - const std::vector fields{ - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, - }; - nvinfer1::PluginFieldCollection* pluginPtr = - static_cast( - malloc(sizeof(*pluginPtr) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - pluginPtr->nbFields = static_cast(fields.size()); - pluginPtr->fields = fields.data(); - - auto pluginObj = creator->createPlugin( - "CustomSkipLayerNormPluginDynamic", pluginPtr); - auto plugin_layer = engine_->network()->addPluginV2( - inputs.data(), inputs.size(), *pluginObj); - - assert(plugin_layer != nullptr); - layer = plugin_layer; - } else { - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, with_fp16); - layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); + + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "2"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomSkipLayerNormPluginDynamic")); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + PADDLE_ENFORCE_GT(ld, 0, platform::errors::InvalidArgument( + "in CustomSkipLayerNormPluginDynamic hidden " + "dimension should > 0")); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); } + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index aee39b7cf0c14..2ab024dff327f 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -44,15 +44,6 @@ class SliceOpConverter : public OpConverter { std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); - PADDLE_ENFORCE_EQ( - starts.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { // notice that input shape is [CHW] without batch axis when input has @@ -62,10 +53,6 @@ class SliceOpConverter : public OpConverter { } input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( - "Invalid slice axis. Slice on batch " - "axis is not supported in TensorRT")); if (starts[i] < 0) { starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); } diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 75b317e7bfd90..47a6dd783a70c 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -33,17 +33,7 @@ class SplitOpConverter : public OpConverter { size_t output_num = op_desc.Output("Out").size(); // Get Attrs - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of split TRT converter. " - "Expected 1, received %d.", - input_num)); int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE( - axis, 0, - platform::errors::InvalidArgument( - "Invalid split axis. 
Split on batch is not supported in TensorRT")); std::vector output_lengths = BOOST_GET_CONST(std::vector, op_desc.GetAttr("sections")); diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index a0292b2112463..6105e10799e55 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -58,26 +58,19 @@ class StackOpConverter : public OpConverter { } nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); - assert(layer != nullptr); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num, with_fp16); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::InvalidArgument( + "trt stack layer in converter could not be created.")); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } auto output_name = op_desc.Output("Y").front(); RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); free(inputs); diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 971f99e691972..6158fd130bad8 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -60,6 +60,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { static nvinfer1::IPluginRegistry* GetPluginRegistry() { return static_cast(dy::getPluginRegistry()); } +static int GetInferLibVersion() { + return static_cast(dy::getInferLibVersion()); +} #endif // A logger for create TensorRT infer builder. 
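[Illustrative sketch, not part of the patch] The new GetInferLibVersion() wrapper in helper.h forwards nvinfer's getInferLibVersion(), which reports the runtime that was actually loaded, packed as MAJOR*1000 + MINOR*100 + PATCH (e.g. 7103 for TensorRT 7.1.3). A hypothetical call site:

#include "paddle/fluid/inference/tensorrt/helper.h"

// Returns true when the loaded TensorRT runtime is at least the given packed
// version, e.g. TrtRuntimeAtLeast(7000) for TensorRT >= 7.0.
bool TrtRuntimeAtLeast(int packed_version) {
  return paddle::inference::tensorrt::GetInferLibVersion() >= packed_version;
}
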
@@ -67,9 +70,12 @@ class NaiveLogger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { - case Severity::kINFO: + case Severity::kVERBOSE: VLOG(3) << msg; break; + case Severity::kINFO: + VLOG(2) << msg; + break; case Severity::kWARNING: LOG(WARNING) << msg; break; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 53225b7978077..c8dfc169535da 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -42,15 +42,13 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); teller_set.insert("slice"); -#endif -#if IS_TRT_VERSION_GE(7130) - teller_set.insert("group_norm"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); int8_teller_set.insert("multihead_matmul"); int8_teller_set.insert("skip_layernorm"); - int8_teller_set.insert("fused_embedding_eltwise_layernorm"); - int8_teller_set.insert("matmul"); - int8_teller_set.insert("stack"); int8_teller_set.insert("slice"); +#endif +#if IS_TRT_VERSION_GE(7130) + teller_set.insert("group_norm"); #endif } @@ -67,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { // use this set for no calib int8. std::unordered_set int8_teller_set{"mul", "conv2d", + "matmul", + "stack", "conv2d_fusion", "pool2d", "relu", @@ -137,13 +137,93 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "pool2d" || op_type == "conv2d" || - op_type == "depthwise_conv2d" || op_type == "conv2d_transpose") { + if (op_type == "depthwise_conv2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; } + + if (op_type == "pool2d") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + if (paddings.size() > 2) return false; + if (desc.Input("X").size() != 1) { + VLOG(3) << "TRT Pool2d expect 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "TRT Pool2d has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + if (!desc.HasAttr("pooling_type")) { + return false; + } else { + std::string pool_type = + BOOST_GET_CONST(std::string, desc.GetAttr("pooling_type")); + if (pool_type != "max" && pool_type != "avg") { + VLOG(3) << "Wrong pool op type, the trt do not support the " + << pool_type << " pool type."; + return false; + } + } + } + + if (op_type == "conv2d" || op_type == "conv2d_transpose" || + op_type == "conv2d_fusion") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + + // conv2d and conv2d_transpose need padding check + if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; + + if (desc.Input("Input").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 input, but got " + << desc.Input("Input").size() << " input."; + return false; + } + + if (desc.Input("Filter").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 filter, but got " + << desc.Input("Filter").size() << " filter."; + return false; + } + + if (desc.HasAttr("enable_int8")) { + if (op_type == "conv2d" || op_type == "conv2d_fusion") { + if (!desc.HasAttr("Input_scale")) { + VLOG(3) << "Input scale not found. 
TRT int8" + " requires conv/deconv to have " + "input quantization scales."; + return false; + } + } + } + + if (op_type == "conv2d_transpose") { + if (!desc.HasAttr("dilations")) { + return false; + } else { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In conv2d_transpose, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + + if (desc.Output("Output").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 output, but got " + << desc.Output("Output").size() << " output."; + return false; + } + } + if (op_type == "matmul") { auto* block = desc.Block(); for (auto& param_name : desc.Inputs()) { @@ -151,7 +231,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() < 3) { - VLOG(1) + VLOG(3) << "matmul op dims < 3 not supported in tensorrt, but got dims " << shape.size() << ", so jump it."; return false; @@ -189,7 +269,18 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2" || op_type == "flatten") { + if (op_type == "flatten2") { + // flatten doesn't support dynamic shape currently + if (!desc.HasAttr("axis")) { + return false; + } else { + if (with_dynamic_shape) return false; + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis != 1) return false; + } + } + + if (op_type == "flatten") { // flatten doesn't support dynamic shape currently if (!desc.HasAttr("axis")) { return false; @@ -229,7 +320,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() != 3) { - VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + VLOG(3) << "multiclass_nms op dims != 3 not supported in tensorrt, " "but got dims " << shape.size() << ", so jump it."; return false; @@ -252,18 +343,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (registry == nullptr) return false; } - if (op_type == "fc" || op_type == "mul") { - const int x_num_col_dims = - desc.HasAttr("x_num_col_dims") - ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) - : (desc.HasAttr("in_num_col_dims") - ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) - : 1); - if (x_num_col_dims != 1 && x_num_col_dims != 2) { - return false; - } - } - if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -279,6 +358,25 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto interp_method = BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; + + if (!desc.HasAttr("scale") || !desc.HasAttr("out_h") || + !desc.HasAttr("out_w")) { + return false; + } else { + auto scale = BOOST_GET_CONST(float, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(scale > 0.f && (out_h <= 0 && out_w <= 0))) { + if (out_h <= 0) { + VLOG(3) << "out_h must be greater than 0 if scale is not set."; + return false; + } + if (out_w <= 0) { + VLOG(3) << "out_w must be greater than 0 if scale is not set."; + return false; + } + } + } } if (op_type == "roi_align") { @@ -303,6 +401,235 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (spatial_scale <= 0.f) return false; } + if (op_type == "hard_swish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "HardSwish op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + + if (desc.Output("Out").size() != 1) { + VLOG(3) << "HardSwish op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "batch_norm") { + const std::vector bn_inputs = {"X", "Bias", "Mean", "Scale", + "Variance"}; + for (unsigned int i = 0; i < bn_inputs.size(); i++) { + if (desc.Input(bn_inputs[i]).size() != 1) { + VLOG(3) << "Invalid " << bn_inputs[i] + << "'s size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Input(bn_inputs[i]).size() << "."; + return false; + } + } + + if (desc.Output("Y").size() != 1) { + VLOG(3) << "Invalid output Y's size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Output("Y").size() << "."; + return false; + } + } + + if (op_type == "split") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of split TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (!desc.HasAttr("axis")) { + return false; + } else { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "Invalid split axis. Split on batch is not supported in " + "TensorRT"; + return false; + } + } + } + + if (op_type == "slice") { + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends")) { + return false; + } else { + std::vector axes = + BOOST_GET_CONST(std::vector, desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, desc.GetAttr("ends")); + if (axes.size() != starts.size() || axes.size() != ends.size()) { + return false; + } + if (!with_dynamic_shape) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] == 0) { + VLOG(3) << "Invalid slice axis. 
Slice on batch axis is not " + "supported in TensorRT"; + return false; + } + } + } + } + } + + if (op_type == "elementwise_add" || op_type == "elementwise_mul") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Input("Y").size() != 1) { + VLOG(3) << "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = " + << desc.Input("Y").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "The input op's Output(\"Out\").size() " + "should equal to 1, but reveceid Output(\"Out\").size() = " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "stack") { + if (!with_dynamic_shape) { + VLOG(3) + << "static shape mode is not supported for TRT stack.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) interface" + " to set the shape information to run the dynamic shape " + "mode."; + return false; + } + } + + if (op_type == "fused_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " + "should be same "; + return false; + } + } + + if (op_type == "gelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "gelu op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "gelu op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "layer_norm") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "input of layer_norm op converter should be 1, got " + << desc.Input("X").size(); + return false; + } + if (desc.Input("Bias").size() != 1) { + VLOG(3) << "Bias of layer_norm op converter should be 1, got " + << desc.Input("Bias").size(); + return false; + } + if (desc.Input("Scale").size() != 1) { + VLOG(3) << "Scale of layer_norm op converter should be 1, got " + << desc.Input("Scale").size(); + return false; + } + if (desc.Output("Y").size() != 1) { + VLOG(3) << "output of layer_norm op converter should be 1, got " + << desc.Output("Y").size(); + return false; + } + } + + if (op_type == "leaky_relu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid number of TRT leaky_relu op converter " + "inputs. Expected 1, but received " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "output of leaky_relu op converter should be 1, got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "pad") { + const float pad_value = BOOST_GET_CONST(float, desc.GetAttr("pad_value")); + if (pad_value != 0.0f) { + VLOG(3) << "The pad layer of TRT only support zero."; + return false; + } + } + + if (op_type == "prelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of prelu TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of prelu TRT converter. 
" + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) { + VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " + "because that " + "the roi_align will change the batch size."; + return false; + } + } + + if (op_type == "shuffle_channel") { + if (with_dynamic_shape) { + VLOG(3) << "You are running the TRT Dynamic Shape mode, " + "the shuffle_channel op does not support dynamic shape yet"; + return false; + } + } + + if (op_type == "skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the skip_layernorm does not support static shape yet"; + return false; + } + } + + if (op_type == "multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the multihead_matmul does not support static shape yet"; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 75628adbe8a85..f74cd671d6dca 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -522,10 +522,10 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) - inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -604,14 +604,23 @@ inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) -inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -inference_analysis_test(test_analyzer_capi_pd_tensor SRCS analyzer_capi_pd_tensor_tester.cc +if (NOT APPLE AND NOT WIN32) + inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - +endif() inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -621,17 +630,17 @@ 
inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_t ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_int SRCS analyzer_capi_int_tester.cc + inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - endif() +endif() -inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc +inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc new file mode 100644 index 0000000000000..de9e2afd705f9 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, gpu_interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + const char* ops_name = "conv_2d"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + int gpu_device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(gpu_device_id, 0); + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + + const char* tensor_name = "image"; + size_t shapes_num[1] = {4}; + int32_t min_shape[4] = {1, 3, 36, 36}; + int32_t max_shape[4] = {1, 3, 224, 224}; + int32_t opt_shape[4] = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape; + int32_t* max_shape_ptr = max_shape; + int32_t* opt_shape_ptr = opt_shape; + PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, + &min_shape_ptr, &max_shape_ptr, + &opt_shape_ptr, FALSE); + PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); + PD_ConfigEnableTensorRtOSS(config); + bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); + EXPECT_TRUE(oss_enabled); + + PD_ConfigEnableTensorRtDla(config, 4); + bool dla_enabled = PD_ConfigTensorRtDlaEnabled(config); + EXPECT_TRUE(dla_enabled); + + PD_ConfigEnableGpuMultiStream(config); + bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config); + EXPECT_TRUE(thread_local_thread); + + PD_ConfigDisableGpu(config); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int num_thread = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(num_thread, 10); + + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char* model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(device_id, 0); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_FLOAT32, + FALSE, 
FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_int8) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_fp16) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, + FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc new file mode 100644 index 0000000000000..d3a15cb285772 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "The inputs' size is: " << input_names->size; + EXPECT_EQ(input_names->size, 2u); + + int32_t shape_0[4] = {1, 3, 224, 224}; + float data_0[1 * 3 * 224 * 224] = {0}; + PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); + PD_TensorReshape(input_0, 4, shape_0); + PD_TensorCopyFromCpuFloat(input_0, data_0); + int32_t shape_1[2] = {1, 1}; + int64_t data_1[1] = {0}; + PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); + PD_TensorReshape(input_1, 2, shape_1); + PD_TensorCopyFromCpuInt64(input_1, data_1); + + LOG(INFO) << "Run Inference in CAPI encapsulation. "; + EXPECT_TRUE(PD_PredictorRun(predictor)); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + LOG(INFO) << "output size is: " << output_names->size; + for (size_t index = 0; index < output_names->size; ++index) { + LOG(INFO) << "output[" << index + << "]'s name is: " << output_names->data[index]; + PD_Tensor* output = + PD_PredictorGetOutputHandle(predictor, output_names->data[index]); + PD_OneDimArrayInt32* shape = PD_TensorGetShape(output); + LOG(INFO) << "output[" << index << "]'s shape_size is: " << shape->size; + int32_t out_size = 1; + for (size_t i = 0; i < shape->size; ++i) { + LOG(INFO) << "output[" << index << "]'s shape is: " << shape->data[i]; + out_size = out_size * shape->data[i]; + } + float* out_data = new float[out_size]; + PD_TensorCopyToCpuFloat(output, out_data); + LOG(INFO) << "output[" << index << "]'s DATA is: " << out_data[0]; + delete[] out_data; + PD_OneDimArrayInt32Destroy(shape); + PD_TensorDestroy(output); + } + PD_PredictorClearIntermediateTensor(predictor); + PD_PredictorTryShrinkMemory(predictor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(input_1); + PD_TensorDestroy(input_0); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc new file mode 100644 index 0000000000000..4369cd78dfa37 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_PredictorRun, predictor_run) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + LOG(INFO) << "Input num: " << input_num; + size_t output_num = PD_PredictorGetOutputNum(predictor); + LOG(INFO) << "Output num: " << output_num; + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + EXPECT_EQ(input_names->size, 2u); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + LOG(INFO) << "Predictor start run!"; + bool success = PD_PredictorRun(predictor); + EXPECT_TRUE(success); + LOG(INFO) << "Predictor run success!"; + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc new file mode 100644 index 0000000000000..18107704ae420 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -0,0 +1,108 @@ 
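A note on the LoD used in the NER test above: the offsets passed to PD_TensorSetLod are cumulative row offsets, so the single level {0, 11} marks one sequence covering rows [0, 11). A minimal sketch of that interpretation (plain C++, not the Paddle API):

// Interpreting a one-level LoD as cumulative sequence offsets (sketch).
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // One LoD level with offsets {0, 11}: a single sequence of 11 rows.
  std::vector<size_t> level = {0, 11};
  for (size_t i = 0; i + 1 < level.size(); ++i) {
    std::printf("sequence %zu covers rows [%zu, %zu)\n", i, level[i],
                level[i + 1]);
  }
  // With offsets {0, 4, 11} the same tensor would hold two sequences,
  // of lengths 4 and 7.
  return 0;
}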
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModelDir(config, model_dir.c_str()); + std::string model_dir_ = PD_ConfigGetModelDir(config); + EXPECT_EQ(model_dir, model_dir_); + + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetProgFile(config, prog_file.c_str()); + PD_ConfigSetParamsFile(config, param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + std::string prog_file_ = PD_ConfigGetProgFile(config); + std::string param_file_ = PD_ConfigGetParamsFile(config); + EXPECT_EQ(prog_file, prog_file_); + EXPECT_EQ(param_file, param_file_); + + PD_ConfigDisableFCPadding(config); + bool fc_padding = PD_ConfigUseFcPadding(config); + EXPECT_FALSE(fc_padding); + + PD_ConfigDisableGpu(config); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + +#ifndef PADDLE_WITH_LITE + PD_ConfigEnableLiteEngine(config, PD_PRECISION_FLOAT32, TRUE, 0, nullptr, 0, + nullptr); + bool lite_enabled = PD_ConfigLiteEngineEnabled(config); + EXPECT_TRUE(lite_enabled); +#endif + + PD_ConfigSwitchIrDebug(config, TRUE); +#ifdef PADDLE_WITH_MKLDNN + const char* ops_name = "conv_2d"; + PD_ConfigEnableMKLDNN(config); + PD_ConfigSetMkldnnOp(config, 1, &ops_name); + PD_ConfigSetMkldnnCacheCapacity(config, 100); + bool mkldnn_enabled = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enabled); + + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int32_t cpu_threads = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(cpu_threads, 10); + + PD_ConfigEnableMkldnnQuantizer(config); + bool mkldnn_qt_enabled = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(mkldnn_qt_enabled); + + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetBfloat16Op(config, 1, &ops_name); + bool mkldnn_bf16_enabled = PD_ConfigMkldnnBfloat16Enabled(config); + EXPECT_TRUE(mkldnn_bf16_enabled); +#endif + + PD_ConfigEnableMemoryOptim(config); + bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_enabled); + + PD_ConfigEnableProfile(config); + bool profile_enabled = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profile_enabled); + + PD_ConfigDisableGlogInfo(config); + bool glog_diabled = PD_ConfigGlogInfoDisabled(config); + EXPECT_TRUE(glog_diabled); + + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + + PD_ConfigPartiallyRelease(config); + 
PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc new file mode 100644 index 0000000000000..f4017fc5a7f34 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void PD_run() { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuFloat(tensor, input.data()); + PD_TensorDataFloat(tensor, &place, &size); + PD_TensorMutableDataFloat(tensor, place); + + PD_TwoDimArraySize lod; + lod.size = 0; + lod.data = NULL; + PD_TensorSetLod(tensor, &lod); + + PD_PredictorRun(predictor); + + std::vector out_data; + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + int32_t out_num = std::accumulate(output_shape->data, + output_shape->data + output_shape->size, 1, + std::multiplies()); + out_data.resize(out_num); + PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); + LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); + PD_DataType data_type = PD_TensorGetDataType(output_tensor); + EXPECT_EQ(data_type, PD_DATA_FLOAT32); + + PD_TwoDimArraySize* out_lod = PD_TensorGetLod(output_tensor); + + PD_TwoDimArraySizeDestroy(out_lod); + PD_OneDimArrayInt32Destroy(output_shape); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} +TEST(PD_Tensor, PD_run) { PD_run(); } + +TEST(PD_Tensor, int32) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + 
PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt32(tensor, input.data()); + int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int32_t* mutable_data_ptr = PD_TensorMutableDataInt32(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT32); + PD_TensorCopyToCpuInt32(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, int64) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt64(tensor, input.data()); + int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int64_t* mutable_data_ptr = PD_TensorMutableDataInt64(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT64); + PD_TensorCopyToCpuInt64(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, uint8) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + uint8_t input[1 * 3 * 300 * 300] = {0}; + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuUint8(tensor, input); + uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + uint8_t* mutable_data_ptr = PD_TensorMutableDataUint8(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_UINT8); + PD_TensorCopyToCpuUint8(tensor, input); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +std::string read_file(std::string filename) { + std::ifstream file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + +TEST(PD_Tensor, from_buffer) { + PD_Config* config = PD_ConfigCreate(); + std::string prog_file = FLAGS_infer_model + "/__model__"; + std::string params_file = FLAGS_infer_model + "/__params__"; + + std::string prog_str = read_file(prog_file); + std::string params_str = read_file(params_file); + + PD_ConfigSetModelBuffer(config, 
prog_str.c_str(), prog_str.size(), + params_str.c_str(), params_str.size()); + + bool model_from_memory = PD_ConfigModelFromMemory(config); + EXPECT_TRUE(model_from_memory); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc new file mode 100644 index 0000000000000..8951c446b1f83 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +typedef struct RunParameter { + PD_Predictor* predictor; + int32_t* shapes; + size_t shape_size; + float* input_data; + int32_t out_size; + float* out_data; + int32_t thread_index; +} RunParameter; + +void* run(void* thread_param) { + struct RunParameter* param = (struct RunParameter*)thread_param; + LOG(INFO) << "Thread " << param->thread_index << " start run!"; + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(param->predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(param->predictor, input_names->data[0]); + PD_TensorReshape(tensor, param->shape_size, param->shapes); + PD_TensorCopyFromCpuFloat(tensor, param->input_data); + PD_PredictorRun(param->predictor); + PD_OneDimArrayCstr* output_names = + PD_PredictorGetOutputNames(param->predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(param->predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + param->out_size = 1; + for (size_t index = 0; index < output_shape->size; ++index) { + param->out_size = param->out_size * output_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(output_shape); + param->out_data = + reinterpret_cast(malloc(param->out_size * sizeof(float))); + PD_TensorCopyToCpuFloat(output_tensor, param->out_data); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + LOG(INFO) << "Thread " << param->thread_index << " end run!"; + return NULL; +} +void threads_run(int thread_num) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + + pthread_t* threads = + reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); + RunParameter* params = reinterpret_cast( + malloc(thread_num * sizeof(RunParameter))); + int32_t shapes[4] = {1, 3, 300, 300}; + float* input = + reinterpret_cast(malloc(1 * 3 * 300 * 
300 * sizeof(float))); + memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); + for (int i = 0; i < thread_num; ++i) { + params[i].predictor = PD_PredictorClone(predictor); + params[i].shapes = shapes; + params[i].shape_size = 4; + params[i].input_data = input; + params[i].out_size = 0; + params[i].out_data = NULL; + params[i].thread_index = i; + pthread_create(&(threads[i]), NULL, run, (params + i)); + } + for (int i = 0; i < thread_num; ++i) { + pthread_join(threads[i], NULL); + } + ASSERT_GT(params[0].out_size, 0); + + for (int i = 1; i < thread_num; ++i) { + ASSERT_EQ(params[i].out_size, params[0].out_size); + for (int j = 0; j < params[i].out_size; ++j) { + ASSERT_EQ(params[i].out_data[j], params[0].out_data[j]); + } + } + for (int i = 0; i < thread_num; ++i) { + PD_PredictorDestroy(params[i].predictor); + free(params[i].out_data); + } + free(input); + free(params); + free(threads); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Predictor, PD_multi_threads_run) { threads_run(10); } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc new file mode 100644 index 0000000000000..11de1a5a6fab4 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
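The multi-thread test above follows the clone-per-thread pattern: PD_PredictorClone gives every worker its own execution state while the loaded model stays shared. For readers who prefer C++ threads over raw pthreads, a sketch of the same pattern is below; the model path, input shape, and thread count are placeholders, not values from this PR.

// Clone-per-thread inference with std::thread instead of pthreads (sketch only).
#include <string>
#include <thread>
#include <vector>
#include "paddle/fluid/inference/capi_exp/pd_inference_api.h"

void Worker(PD_Predictor* local) {
  PD_OneDimArrayCstr* names = PD_PredictorGetInputNames(local);
  PD_Tensor* in = PD_PredictorGetInputHandle(local, names->data[0]);
  int32_t shape[4] = {1, 3, 300, 300};
  std::vector<float> data(1 * 3 * 300 * 300, 0.f);
  PD_TensorReshape(in, 4, shape);
  PD_TensorCopyFromCpuFloat(in, data.data());
  PD_PredictorRun(local);
  PD_TensorDestroy(in);
  PD_OneDimArrayCstrDestroy(names);
  PD_PredictorDestroy(local);  // each thread owns and frees its clone
}

int main() {
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigSetModel(config, "mobilenet/__model__", "mobilenet/__params__");
  PD_Predictor* predictor = PD_PredictorCreate(config);

  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back(Worker, PD_PredictorClone(predictor));
  }
  for (auto& t : threads) t.join();
  PD_PredictorDestroy(predictor);
  return 0;
}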
*/ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + + PD_Predictor *predictor = PD_PredictorCreate(config); + PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); + + const int batch_size = 1; + const int channels = 3; + const int height = 318; + const int width = 318; + float *input = new float[batch_size * channels * height * width](); + + int32_t shape[4] = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape); + PD_TensorCopyFromCpuFloat(tensor, input); + EXPECT_TRUE(PD_PredictorRun(predictor)); + + delete[] input; + PD_TensorDestroy(tensor); + PD_PredictorDestroy(predictor); +} + +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_Config, profile_mkldnn) { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigEnableMKLDNN(config); + bool mkldnn_enable = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enable); + PD_ConfigEnableMkldnnQuantizer(config); + bool quantizer_enable = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(quantizer_enable); + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetMkldnnCacheCapacity(config, 0); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc new file mode 100644 index 0000000000000..f4fd04e85840d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_Config, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + PD_ConfigEnableXpu(config, 0xfffc00); + bool use_xpu = PD_ConfigUseXpu(config); + EXPECT_TRUE(use_xpu); + int32_t device_id = PD_ConfigXpuDeviceId(config); + EXPECT_EQ(device_id, 0); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index c1b12f5c0ecbb..b1a45afa99d9a 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -54,6 +54,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -68,13 +69,14 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" "2. If no, please decrease the batch size of your model. %s\n\n", place_.device, string::HumanReadableSize(size), place_.device, - string::HumanReadableSize(avail), place_.device, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + place_.device, err_msg)); } } // namespace allocation diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index c733ba5c68c9b..0d7065d8bfba0 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -125,6 +125,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -139,7 +140,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. 
" - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" @@ -150,8 +151,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(avail), gpu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); } } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 7b17899d3da23..6e11c64afc4bd 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -42,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +if (WITH_DLNNE) + add_subdirectory(dlnne) +endif() + if (WITH_LITE) add_subdirectory(lite) endif() @@ -195,3 +199,7 @@ endif() if(WITH_ASCEND_CL) cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) endif() + +if (WITH_GPU OR WITH_ASCEND_CL) +cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 1cac9ed9f1dd0..055909ba6f486 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -162,6 +162,12 @@ Sigmoid Activation Operator )DOC"; +UNUSED constexpr char SiluDoc[] = R"DOC( +Silu Activation Operator + +$$out = x * \\frac{1}{1 + e^{-x}}$$ +)DOC"; + UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator @@ -697,6 +703,7 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index fb9f956f17c0b..7245dea9cf949 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,31 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -2129,6 +2154,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc new file mode 100644 index 0000000000000..181dd6eabe22d --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
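The gradient implemented by SiluGradFunctor above follows from silu(x) = x * sigmoid(x): silu'(x) = sigmoid(x) * (1 + x * (1 - sigmoid(x))) = (1 / (1 + e^-x)) * (1 + x * e^-x / (1 + e^-x)), which is exactly the temp1/temp2 expression in the functor. A quick standalone finite-difference check of that formula (my own sketch, independent of the framework):

// Finite-difference check of the silu gradient formula used above (sketch).
#include <cassert>
#include <cmath>
#include <initializer_list>

double silu(double x) { return x / (1.0 + std::exp(-x)); }

double silu_grad(double x) {
  double temp1 = 1.0 + std::exp(-x);  // 1 + e^{-x}
  double temp2 = x * std::exp(-x);    // x * e^{-x}
  return (1.0 / temp1) * (1.0 + temp2 / temp1);
}

int main() {
  const double h = 1e-6;
  for (double x : {-3.0, -0.5, 0.0, 0.7, 2.5}) {
    double numeric = (silu(x + h) - silu(x - h)) / (2.0 * h);
    assert(std::fabs(numeric - silu_grad(x)) < 1e-5);
  }
  return 0;
}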
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AllocFloatStatusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), "Output", "FloatStatus", + "alloc_float_status"); + ctx->SetOutputDim("FloatStatus", {8}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("FloatStatus", + "(Tensor) of shape {8} that holds the float status."); + AddComment(R"DOC( + Produces a float Tensor that holds the float status +)DOC"); + } +}; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Operator alloc_float_status is not supported on CPU")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + alloc_float_status, ops::AllocFloatStatusOp, ops::AllocFloatStatusMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc new file mode 100644 index 0000000000000..fe5b08af52a62 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* float_status = ctx.Output("FloatStatus"); + float_status->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 9d78936ad5f7f..c7520dbd34f6a 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -60,6 +60,12 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " "operator."); +#ifdef PADDLE_WITH_ASCEND_CL + AddInput("FloatStatus", + "(Tensor) 1-dim tensor of shape [8], allocated by " + "alloc_float_status op") + .AsDispensable(); +#endif AddOutput("Out", "(Tensors) The scaled output tensor of " "check_finite_and_unscale operator.") diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 21968dcb05dd1..8fd45326e4ec6 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -24,12 +24,19 @@ namespace operators { using Tensor = framework::Tensor; +// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. +// On NPU, we do not really check the data of input tensors, +// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, +// and clear it after this op. +// Which may leads to wrong result if the input tensors is not calculated +// on NPU device, but got from other way, for example, feeding. template class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); auto outs = ctx.MultiOutput("Out"); auto* found_inf = ctx.Output("FoundInfinite"); @@ -56,58 +63,60 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { runner_inverse.Run(stream); tmp_inverse_out = &inverse_out; - size_t x_size = xs.size(); - for (size_t i = 0; i < x_size; ++i) { + // NOTE(zhiqiu): + Tensor tmp; + tmp.mutable_data({8}, ctx.GetPlace()); + + // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. + // tmp is only placeholder. 
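+ // The remaining flow below: ReduceSumD adds up the eight status words of
+ // float_status; a sum greater than 1 is taken to mean that a NaN/Inf was
+ // produced on the device since the last clear. In that case the Mul against
+ // the inverse scale is skipped for every input, and NPUClearFloatStatus
+ // resets the status registers at the end so the next step starts clean.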
+ auto runner_float_status = + NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, + {{"message", std::string("check_nan_and_inf")}}); + runner_float_status.Run(stream); + + Tensor sum; + sum.mutable_data({1}, ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {*float_status}, {sum}, + {{"axes", std::vector{0}}, {"keep_dims", true}}); + runner_reduce_sum.Run(stream); + + std::vector sum_vec; + TensorToVector( + sum, ctx.template device_context(), + &sum_vec); + found_inf_data = (sum_vec[0] > 1); + + VLOG(4) << "found_inf_data:" << found_inf_data; + + for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); - - // step2: CheckNumerics - // CheckNumerics runs on the Ascend AI CPU, which delivers poor - // performance. - Tensor check_xout(x->type()); - check_xout.Resize(x->dims()); - check_xout.mutable_data(ctx.GetPlace()); - try { - auto runner_checknumerics = - NpuOpRunner("CheckNumerics", {*x}, {check_xout}, - {{"message", std::string("check_nan_and_inf")}}); - runner_checknumerics.Run(stream); - ctx.template device_context() - .Wait(); - } catch (platform::EnforceNotMet& exception) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; - } catch (...) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; - } - if (!found_inf_data) { // MatMul auto runner_matmul = NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); runner_matmul.Run(stream); - } else { - // ZerosLike - auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {*out}, {}); - runner_zeroslike.Run(stream); - } // end if - } // end for + } + } // set found_inf to true - if (found_inf_data) { - Tensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool* is_found_inf = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - *is_found_inf = true; - - framework::TensorCopy( - found_inf_tensor, ctx.GetPlace(), - ctx.template device_context(), found_inf); - ctx.template device_context().Wait(); - } + VLOG(4) << "found overflow:" << found_inf_data; + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = found_inf_data; + + framework::TensorCopy( + found_inf_tensor, ctx.GetPlace(), + ctx.template device_context(), found_inf); + ctx.template device_context().Wait(); + + auto runner_clear_status = + NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); + runner_clear_status.Run(stream); } }; diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b1b..41dc87ac1ba47 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + 
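// NOTE(editor): summary of the launch shape used for the hand-written
// BNForwardTraining kernel launched below (a restatement of the surrounding
// code, not extra behavior): one thread block is assigned per channel, and
// each block reduces its channel's N * H * W * D values with cub::BlockReduce,
// so
//
//   block = 256 threads                          (the BlockDim template arg)
//   grid  = min(C, max_physical_threads / block) (at least one block)
//
// Channels beyond `grid` are handled by the `i += gridDim.x` loop inside the
// kernel.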
const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. 
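// NOTE(editor): nothing is destroyed on the ROCm path above because the
// matching miopenCreateTensorDescriptor calls earlier in this kernel are also
// commented out while the plain HIP kernels are used; only this CUDA branch
// still owns cuDNN descriptors that must be released.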
PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -668,8 +893,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +943,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +967,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +993,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1006,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1107,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1196,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. 
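// NOTE(editor): for reference, the BNBackward launches above implement the
// standard batch-norm backward formulas. With m = N * H * W * D elements per
// channel c and x_hat = (x - mean[c]) * inv_var[c], the kernel computes
//
//   dbias[c]  = sum(dy)
//   dscale[c] = sum(dy * (x - mean[c])) * inv_var[c]
//   dx        = scale[c] * inv_var[c]
//               * (dy - dbias[c] / m - x_hat * dscale[c] / m)
//
// This is a summary of the kernel defined earlier in this file, not
// additional behavior.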
+// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 8d34e0ba99c2c..0de0f5e450579 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -92,6 +92,7 @@ REGISTER_OP_NPU_KERNEL( cast, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, + ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index eb27df8a36757..7176a0466bb83 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -145,10 +145,14 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CPU_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); REGISTER_OP_VERSION(clip) .AddCheckpoint( diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index d31b81c13c5cf..fd61e4ea61d4f 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -17,8 +17,12 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 977a208d20e78..3f210219608fb 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -19,12 +19,6 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -if(WITH_ASCEND) - op_library(gen_nccl_id_op) - op_library(c_gen_nccl_id_op) -endif() - - if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() @@ -35,5 +29,38 @@ if(WITH_XPU_BKCL) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND_CL) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") + +if(WITH_ASCEND_CL) + 
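# NOTE(editor): COMMON_TEST_DEPS_FOR_HCOM below just bundles the comm-init and
# gen-hccl-id helpers that most of the *_op_npu_test targets link against. The
# tests themselves read RANK_ID, DEVICE_ID and FLAGS_selected_npus at runtime
# and assume a two-rank HCCL setup, so they are only meaningful on a machine
# with Ascend devices configured.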
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags + dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 4111a19c5ebc8..c4e779698ccca 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allgather result"); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc new file mode 100644 index 0000000000000..e7f05549d9efe --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#include + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CAllGatherOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + framework::DDim out_dims = in->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint64_t send_numel = in->numel(); + void *send_buff = reinterpret_cast(const_cast(in->data())); + void *recv_buff = reinterpret_cast(out->data()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + VLOG(3) << "begin hccl allgather, parameter is: " + << ", group is " << group << ", ring_id is " << ring_id + << ", nranks is " << nranks; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather( + send_buff, recv_buff, send_numel, dtype, comm->comm(), + reinterpret_cast(stream))); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc new file mode 100644 index 0000000000000..4c7dfc4aad7d0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allgather); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allgather, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 1; + int num2 = 4; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size() * 2); + for (uint32_t i = 0; i < out_vec.size() / 2; i++) { + EXPECT_EQ(out_vec[i], 1.0); + } + for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_allgather, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllGatherOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 835b49e57bc09..8bdbdfac8ffd1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -37,14 +37,19 @@ class CAllReduceMaxOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Max"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = 
paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp, - ops::CAllReduceMaxOpMaker); +REGISTER_OPERATOR( + c_allreduce_max, ops::CAllReduceOp, ops::CAllReduceMaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMaxInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_max, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc new file mode 100644 index 0000000000000..4dece4a3721ff --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_max, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc new file mode 100644 index 0000000000000..b7fd2739d5118 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_max); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 100; + int num2 = 100; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id * 3); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 4.0); + } +} + +TEST(c_allreduce_max, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllReduceOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index efc19659c83ec..9d913b12b1376 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -37,14 +37,19 @@ class CAllReduceMinOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Min"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMinInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp, - ops::CAllReduceMinOpMaker); 
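// NOTE(editor): the REGISTER_OPERATOR form below keeps c_allreduce_min
// gradient-free (both EmptyGradOpMaker instantiations are passed explicitly)
// while additionally attaching AllreduceMinInplaceInferer, which only tells
// the framework that "X" and "Out" may share one buffer. c_allreduce_max and
// c_allreduce_prod receive the same treatment in this change, and
// c_allreduce_sum gains the matching inplace inferer.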
+REGISTER_OPERATOR( + c_allreduce_min, ops::CAllReduceOp, ops::CAllReduceMinOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMinInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_min, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc new file mode 100644 index 0000000000000..48e1d2eeb58c5 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_min, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index ab1cc508fdf69..0eaa377869ef6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -19,9 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" #endif @@ -38,6 +40,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -113,6 +119,73 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CAllReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl allreduce, parameter is: " + << "input num: " << numel << "dtype: " << dtype + << "hccl_red_type: " << hccl_red_type << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + template class CAllReduceOpXPUKernel : public framework::OpKernel { public: @@ -240,10 +313,20 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all reduce.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "use_model_parallel", + "(bool default false) use this op with model parallel mode. 
In model " + "parallel mode, the backward is c_identity which returns itself for " + "c_allreduce_sum.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( CAllReduce %s Operator diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 5ab07ef026bac..3ad078e1c8ff0 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -37,14 +37,19 @@ class CAllReduceProdOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Prod"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp, - ops::CAllReduceProdOpMaker); +REGISTER_OPERATOR( + c_allreduce_prod, ops::CAllReduceOp, ops::CAllReduceProdOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceProdInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_prod, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc new file mode 100644 index 0000000000000..f3d14afe0a1bc --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_prod, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 68061e6ae6bea..18c317506c06e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -37,7 +37,12 @@ class CAllReduceSumOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr retv) const override { - retv->SetType("c_allreduce_sum"); + bool use_mp = BOOST_GET_CONST(bool, this->GetAttr("use_model_parallel")); + if (use_mp) { + retv->SetType("c_identity"); + } else { + retv->SetType("c_allreduce_sum"); + } retv->SetInput("X", this->OutputGrad("Out")); retv->SetOutput("Out", this->InputGrad("X")); retv->SetAttrMap(this->Attrs()); @@ -49,6 +54,8 @@ class CAllReduceSumOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Sum"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceSumInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle @@ -58,7 +65,7 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp, ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpGradMaker, - ops::CAllReduceSumOpMaker); + ops::CAllReduceSumOpMaker, ops::AllreduceSumInplaceInferer); REGISTER_OP_CPU_KERNEL(c_allreduce_sum, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc new file mode 100644 index 0000000000000..b66e2e1968908 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_sum, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000..f1bf9683e3559 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 3.0); + } +} + +TEST(c_allreduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + // only support one device, if more than one device, use first default + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 1; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLAllReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc index 928fa8549ffb9..271d543eb2364 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cc @@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("root", "(int default 0) root id for broadcasting.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", 
"(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc new file mode 100644 index 0000000000000..a60ba86572822 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + + VLOG(3) << "begin hccl broadcast, parameter is: " + << "root " << root << ", group is " << group + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + << framework::product(out->dims()); + + dev_ctx->Wait(); + + if (out != x) { + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + dev_ctx->Wait(); + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc new file mode 100644 index 0000000000000..9e39613f3fbe3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_broadcast, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc new file mode 100644 index 0000000000000..7817f19bacb18 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+class CCommInitOpAscend : public framework::OperatorBase {
+ public:
+  CCommInitOpAscend(const std::string& type,
+                    const framework::VariableNameMap& inputs,
+                    const framework::VariableNameMap& outputs,
+                    const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    PADDLE_ENFORCE_EQ(is_npu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "CCommInitOpAscend can run on npu place only."));
+
+    auto var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::InvalidArgument("Input can not be empty."));
+#if defined(PADDLE_WITH_ASCEND_CL)
+    HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
+
+    int rank_ids = Attr<int>("rank_ids");
+    int rank_id = Attr<int>("rank");
+    int rid = Attr<int>("ring_id");
+    int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
+    if (Attr<int>("device_id") >= 0) {
+      device_id = Attr<int>("device_id");
+    }
+    platform::HCCLCommContext::Instance().CreateHCCLComm(
+        hccl_id, rank_ids, rank_id, device_id, rid);
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "Raw variable containing an HCCL UniqueId instance.");
+    AddComment(R"DOC(
+CCommInit operator
+
+Initialize collective communication context within this trainer
+)DOC");
+    AddAttr<int>("rank_ids",
+                 "(int) The number of ranks of distributed trainers");
+    AddAttr<int>("rank",
+                 "(int) The rank of the trainer in distributed training.");
+    AddAttr<int>("device_id",
+                 "(int) The device_id on which to initialize the communicator."
+                 "Now, you only have to set this attr manually for pipeline "
+                 "training. Otherwise, make it as default.")
+        .SetDefault(-1);
+    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend,
+                  ops::CCommInitOpAscendMaker);
diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc
new file mode 100644
index 0000000000000..551fde2116258
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_concat_op.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_concat_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CConcatOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_concat");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_concat");
+    int nranks = ctx->Attrs().Get<int>("nranks");
+    int rank = ctx->Attrs().Get<int>("rank");
+    int ring_id = ctx->Attrs().Get<int>("ring_id");
+    PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument(
+                                     "The number of ranks (%d) for c_concat "
+                                     "must be greater than 1.",
+                                     nranks));
+    PADDLE_ENFORCE_GE(
+        ring_id, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for c_concat must be non-negative.", ring_id));
+    PADDLE_ENFORCE_GE(
+        rank, 0, platform::errors::InvalidArgument(
+                     "The rank (%d) for c_concat must be non-negative.", rank));
+    PADDLE_ENFORCE_LT(rank, nranks,
+                      platform::errors::InvalidArgument(
+                          "The value of rank (%d) for c_concat must "
+                          "be less than that of nranks.",
+                          rank, nranks));
+
+    framework::DDim dim = ctx->GetInputDim("X");
+    dim[dim.size() - 1] = dim[dim.size() - 1] * nranks;
+    if (dim[dim.size() - 1] < 0) dim[dim.size() - 1] = -1;
+    ctx->SetOutputDim("Out", dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class CConcatOpGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> retv) const override {
+    retv->SetType("c_split");
+    retv->SetInput("X", this->OutputGrad("Out"));
+    retv->SetOutput("Out", this->InputGrad("X"));
+    retv->SetAttrMap(this->Attrs());
+  }
+};
+
+class CConcatOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) tensor to be concatenated.");
+    AddOutput("Out", "(Tensor) the result of concat.");
+    AddAttr<int>("rank", "(int default 0) rank id.").SetDefault(0);
+    AddAttr<int>("nranks", "(int default 1) number of ranks.").SetDefault(1);
+    AddAttr<int>("ring_id", "(int default 0) ring id.").SetDefault(0);
+    AddAttr<bool>(
+        "use_calc_stream",
+        "(bool default true) eject CUDA operations to calculation stream.")
+        .SetDefault(true);
+    AddAttr<bool>("use_model_parallel",
+                  "(bool default true) use this op with model parallel.")
+        .SetDefault(true);
+    AddComment(R"DOC(
+CConcat Operator
+AllGather the tensors on different trainers and concat them along the last dimension.
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_concat, ops::CConcatOp, + ops::CConcatOpGradMaker, + ops::CConcatOpGradMaker, + ops::CConcatOpMaker); + +REGISTER_OP_CPU_KERNEL(c_concat, ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc new file mode 100644 index 0000000000000..bfdc49c440aae --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_concat_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CConcatOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_concat must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "less than that of nranks (%d).", + rank, nranks)); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::Tensor temp_out; + framework::DDim temp_out_dims = x->dims(); + temp_out_dims[0] *= nranks; + temp_out.mutable_data(temp_out_dims, place); + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + + std::vector inputs; + int axis = x->dims().size() - 1; + auto out_dims = x->dims(); + out_dims[out_dims.size() - 1] *= nranks; + int rows_per_tensor = 
x->dims()[0]; + int offset = 0; + for (int i = 0; i < nranks; i++) { + framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + inputs.emplace_back(temp); + offset += rows_per_tensor; + } + + math::ConcatFunctor functor; + out->mutable_data(out_dims, place); + auto& dev_ctx2 = ctx.template device_context(); + functor(dev_ctx2, inputs, axis, out); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.h b/paddle/fluid/operators/collective/c_concat_op.h new file mode 100644 index 0000000000000..55a5799e37b6f --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CConcatOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_concat for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 0000000000000..593eaf923a978 --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
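The c_concat CUDA kernel above first all-gathers every rank's block into a temporary tensor whose leading dimension is multiplied by nranks, then slices that buffer per rank and concatenates the slices along the last axis. What follows is only a host-side sketch of that index bookkeeping, not the operator's actual code path; nranks, rows and cols are made-up toy values.

#include <cstdio>
#include <vector>

// Sketch: emulate c_concat's reshuffle on the host (illustrative only).
// After the allgather, rank r's [rows x cols] block occupies rows
// [r*rows, (r+1)*rows) of the gathered buffer; the concat step then places
// that block at column offset r*cols of the [rows x cols*nranks] output.
int main() {
  const int nranks = 2, rows = 3, cols = 4;  // assumed toy sizes
  std::vector<float> gathered(nranks * rows * cols);
  for (size_t i = 0; i < gathered.size(); ++i) {
    gathered[i] = static_cast<float>(i);
  }

  std::vector<float> out(rows * cols * nranks);
  for (int r = 0; r < nranks; ++r) {
    for (int i = 0; i < rows; ++i) {
      for (int j = 0; j < cols; ++j) {
        // source: block r, row i, col j  ->  dest: row i, col r*cols + j
        out[i * cols * nranks + r * cols + j] =
            gathered[(r * rows + i) * cols + j];
      }
    }
  }
  for (int j = 0; j < cols * nranks; ++j) printf("%g ", out[j]);  // first row
  printf("\n");
  return 0;
}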
*/ +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + std::function func = [&](size_t i) -> std::string { + return Output("Out"); + }; + + if (rank == 0) { + std::vector endpoint_list = + Attr>("other_endpoints"); + SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + } else { + std::string endpoint = Attr("endpoint"); + RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + } + scope.DeleteScope(&local_scope); + } +}; + +#else + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + VLOG(3) << "ele"; + AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +CGenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc new file mode 100644 index 0000000000000..646c27b90e17e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +class CIdentityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_identity"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_identity"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CIdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) identity tensor."); + AddOutput("Out", "(Tensor) identity tensor."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +Identity Operator which returns a copy of itself. +)DOC"); + } +}; + +template +class CIdentityOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_identity, ops::CIdentityOp, + ops::CIdentityOpGradMaker, + ops::CIdentityOpGradMaker, + ops::CIdentityOpMaker); + +REGISTER_OP_CPU_KERNEL(c_identity, ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc new file mode 100644 index 0000000000000..8ccf40e317ade --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h new file mode 100644 index 0000000000000..ca817fb6bac0e --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_identity for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc new file mode 100644 index 0000000000000..f35b4c2f70722 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
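Taken together with the c_allreduce_sum grad maker earlier in this patch, the c_identity grad maker above makes the two ops mirror each other under model parallelism: c_identity's backward is c_allreduce_sum, and c_allreduce_sum's backward becomes c_identity when use_model_parallel is true. A compact sketch of that pairing follows; GradOpTypeFor is an illustrative helper, not part of the framework's API.

#include <cassert>
#include <cstdio>
#include <string>

// Sketch of the forward/backward pairing encoded by the two grad makers.
std::string GradOpTypeFor(const std::string& fwd_op, bool use_model_parallel) {
  if (fwd_op == "c_identity") return "c_allreduce_sum";
  if (fwd_op == "c_allreduce_sum")
    return use_model_parallel ? "c_identity" : "c_allreduce_sum";
  return fwd_op;  // other collectives are out of scope for this sketch
}

int main() {
  assert(GradOpTypeFor("c_identity", true) == "c_allreduce_sum");
  assert(GradOpTypeFor("c_allreduce_sum", true) == "c_identity");
  assert(GradOpTypeFor("c_allreduce_sum", false) == "c_allreduce_sum");
  printf("grad of c_allreduce_sum (model parallel): %s\n",
         GradOpTypeFor("c_allreduce_sum", true).c_str());
  return 0;
}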
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_max, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc new file mode 100644 index 0000000000000..6ebb7e4c40e68 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_min, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index e537478109972..fa9fd079d8e48 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" #endif @@ -42,6 +42,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -119,6 +123,85 @@ class CReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + int root_id = ctx.Attr("root_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int rank_id = comm->rank(); + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl reduce, parameter is: " + << "input num: " << numel << "root_id: " << root_id + << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type + << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + if (rank_id != root_id) { + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + memory::Copy(npu_place, reinterpret_cast(out->data()), + npu_place, + reinterpret_cast(const_cast(in->data())), + numel * sizeof(T), stream); + } + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + template class CReduceOpXPUKernel : public framework::OpKernel { public: @@ -251,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the reduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce.") + .SetDefault("tag"); +#endif AddAttr("root_id", "(int default 0) root id.").SetDefault(0); AddAttr( "use_calc_stream", diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc new file mode 100644 index 0000000000000..f0b7021e7997d --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_prod, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc new file mode 100644 index 0000000000000..dd4dbbd5f3645 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_sum, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc new file mode 100644 index 0000000000000..3683c7722ba3b --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
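The CReduceOpASCENDKernel added to c_reduce_op.h above emulates a rooted reduce with a full HcclAllReduce and then copies the original input back on every non-root rank, which is what the c_reduce_sum test below asserts (the root sees the sum, the others see their own data). A toy host-side restatement of that rule, with made-up rank and root values:

#include <cstdio>
#include <vector>

// Sketch: after the allreduce, only the root keeps the reduced values;
// every other rank restores its own input buffer.
int main() {
  const int root_id = 0, nranks = 2;  // assumed toy setup
  for (int rank_id = 0; rank_id < nranks; ++rank_id) {
    std::vector<float> input(4, 1.0f + rank_id);  // rank r contributes 1 + r
    std::vector<float> reduced(4, 3.0f);          // sum over two ranks: 1 + 2
    const std::vector<float>& out = (rank_id == root_id) ? reduced : input;
    printf("rank %d sees out[0] = %g\n", rank_id, out[0]);
  }
  return 0;
}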
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + int root_id = 0; + attrs["root_id"] = root_id; + + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + if (rank_id == root_id) { + EXPECT_EQ(out_vec[i], 3.0); + } else { + EXPECT_EQ(out_vec[i], init[i]); + } + } +} + +TEST(c_reduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 2; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index ada1fd2b1270c..7836f11dc9b1f 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nranks", "Total trainer count of the distributed training job") .SetDefault(1); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string 
default tag) tag for reduce scatter.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 366d8a3747cfb..490b152bc2d30 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc new file mode 100644 index 0000000000000..44096a82c34d6 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpAscendKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + auto out_dims = in->dims(); + PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The input tensor X's " + "dim[0] (%d) should be divisible by nranks(%d)", + out_dims[0], nranks)); + + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + uint64_t recv_numel = in->numel() / nranks; + + void* inputPtr = reinterpret_cast(const_cast(in->data())); + void* outputPtr = reinterpret_cast(out->data()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + VLOG(3) << "begin hccl reduce scatter, parameter is: " + << "recv_numel: " << recv_numel << "dtype: " << dtype + << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); +#else + 
PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reducescatter, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc new file mode 100644 index 0000000000000..f82f050a7206f --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reducescatter); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int num1 = 4; + int num2 = 1; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + int iter_num = 10; + for (int i = 0; i < iter_num; i++) { + op->Run(*scope, place); + ctx.Wait(); + } + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size() / 2); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_reducescatter, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLReduceScatterOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc new file mode 100644 index 0000000000000..03046d571d0f0 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
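For reference, the c_reducescatter NPU kernel above checks that dim 0 of the input is divisible by nranks, divides that dimension by nranks for the output, and reduce-scatters with HCCL_REDUCE_SUM; the accompanying test runs two ranks, each feeding a 4x1 tensor of ones and each expecting a 2x1 tensor of 2.0. A minimal standalone sketch of that expected behaviour, using plain std::vector instead of the Paddle/HCCL APIs (function and variable names here are illustrative only):

#include <cassert>
#include <cstddef>
#include <vector>

// Illustration only: emulate reduce-scatter(SUM) over `nranks` equally sized
// inputs. Rank `rank` receives the element-wise sum of the rank-th chunk.
std::vector<float> ReduceScatterSum(
    const std::vector<std::vector<float>>& per_rank_input, int nranks,
    int rank) {
  const std::size_t numel = per_rank_input[0].size();
  assert(numel % nranks == 0);  // mirrors the dim[0] % nranks check above
  const std::size_t recv_numel = numel / nranks;
  std::vector<float> out(recv_numel, 0.0f);
  for (int r = 0; r < nranks; ++r) {
    for (std::size_t i = 0; i < recv_numel; ++i) {
      out[i] += per_rank_input[r][rank * recv_numel + i];
    }
  }
  return out;
}

int main() {
  // Two ranks, each contributing a 4x1 tensor of ones, as in the unit test.
  std::vector<std::vector<float>> inputs(2, std::vector<float>(4, 1.0f));
  auto out = ReduceScatterSum(inputs, /*nranks=*/2, /*rank=*/0);
  assert(out.size() == 2u);  // 4 / nranks
  assert(out[0] == 2.0f && out[1] == 2.0f);
  return 0;
}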
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_split_op.h" + +namespace paddle { +namespace operators { + +class CSplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_split"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_split"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_split must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_split must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_split must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CSplitOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allgather"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CSplitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be split."); + AddOutput("Out", "(Tensor) the result of split."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("use_model_parallel", + "(bool default false) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CSplit Operator +Split the tensor evenly according to its rank. 
+)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_split, ops::CSplitOp, + ops::CSplitOpGradMaker, + ops::CSplitOpGradMaker, + ops::CSplitOpMaker); + +REGISTER_OP_CPU_KERNEL(c_split, ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu.cc new file mode 100644 index 0000000000000..92a7f5e41b1d2 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cu.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_split_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + auto place = ctx.GetPlace(); + + PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_split must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "less than that of nranks (%d).", + rank, nranks)); + + auto& dev_ctx = ctx.template device_context(); + std::vector shape_refer; + std::vector results; + size_t numel = x->numel(); + auto dims = x->dims(); + numel /= nranks; + int axis = dims.size() - 1; + dims[dims.size() - 1] /= nranks; + for (int i = 0; i < nranks; i++) { + framework::Tensor* out = new framework::Tensor(); + out->mutable_data(dims, place); + shape_refer.emplace_back(out); + results.emplace_back(out); + } + + math::SplitFunctor functor; + functor(dev_ctx, *x, shape_refer, axis, &results); + out->mutable_data(dims, place); + paddle::framework::TensorCopySync(*results[rank], out->place(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.h b/paddle/fluid/operators/collective/c_split_op.h new file mode 100644 index 0000000000000..ea0c7fc45c66b --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_split for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 700d1173e2ff6..83da712bee908 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -61,6 +61,16 @@ class CSyncCalcStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream())); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc new file mode 100644 index 0000000000000..4b1f7bb340178 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_NO_KERNEL_OP(c_sync_calc_stream); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + f::AttributeMap attrs; + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // sync data + auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op0->Run(*scope, place); + + // run + + auto op = + f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + + // sync op run + auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + // sync op copy + auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op2->Run(*scope, place); + + float expected = 3.0; + + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +TEST(c_sync_calc_stream, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 95b9cd040fe94..e6f6bf5345619 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -19,6 +19,11 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -56,9 +61,8 @@ template class CSyncCommStreamCudaKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto place = ctx.GetPlace(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = ctx.Attr("ring_id"); auto stream = @@ -70,6 +74,16 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc new file mode 100644 index 0000000000000..3915ec4fa35e8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_sync_comm_stream); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + std::cout << "rank_id:" << rank_id << std::endl; + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + std::cout << init[0]; + } + std::cout << std::endl; + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + + // comm sync + + auto sync_op = f::OpRegistry::CreateOp( + "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); + sync_op->Run(*scope, place); + + // ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_sync_comm_stream_op, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 0000000000000..0cb2dd188725f --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
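Each NPU test in this patch bootstraps communication the same way: c_gen_hccl_id produces a 1024-byte root info (generated on rank 0 and shipped to the peer), and c_comm_init_hccl then builds the communicator from that root info plus ring_id, rank, rank_ids and device_id. The toy struct below only sketches that shared state; ToyCommDesc and its fields are illustrative stand-ins, not Paddle types:

#include <array>
#include <cassert>

// Illustration only: the bootstrap state every NPU test above sets up -- a
// shared 1024-byte root info plus ring/rank bookkeeping.
struct ToyCommDesc {
  std::array<char, 1024> root_info{};  // produced by c_gen_hccl_id on rank 0
  int ring_id = 0;                     // comm_init_attrs["ring_id"]
  int rank = 0;                        // comm_init_attrs["rank"]
  int nranks = 2;                      // comm_init_attrs["rank_ids"]
  int device_id = 0;                   // comm_init_attrs["device_id"]
};

int main() {
  ToyCommDesc rank0;
  rank0.root_info.fill('a');
  ToyCommDesc rank1 = rank0;  // both ranks must see the same root info
  rank1.rank = 1;
  assert(rank0.root_info == rank1.root_info);
  return 0;
}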
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. 
init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] 
" + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 0000000000000..15940a76f7110 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,350 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. 
+#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client + return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
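The CHECK_SYS_CALL / RETRY_SYS_CALL_VAL macros above encode one policy: retry a system call while errno is EINTR, EWOULDBLOCK or EAGAIN, and treat any other -1 return as fatal. A plain-function equivalent of that policy (illustrative only, not the macros themselves) might look like:

#include <cerrno>
#include <cstring>
#include <functional>
#include <stdexcept>
#include <string>

// Illustration only: retry transient errno values, throw on hard failures --
// the same policy the macros above apply to socket/bind/connect calls.
int RetrySysCall(const std::function<int()>& call, const std::string& name) {
  for (;;) {
    int ret = call();
    if (ret != -1) return ret;
    if (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN) {
      continue;  // transient: try again, as RETRY_SYS_CALL_VAL does
    }
    throw std::runtime_error("Call to " + name + " failed: " +
                             std::strerror(errno));
  }
}

int main() {
  // A call that succeeds immediately; real uses would wrap socket()/bind().
  int value = RetrySysCall([] { return 42; }, "fake_call");
  return value == 42 ? 0 : 1;
}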
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void 
SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 0000000000000..1ad6f791e1fc3 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
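Taken together, gen_hccl_id and the helper above form a small TCP rendezvous: trainer 0 calls HcclGetRootInfo, connects to every other endpoint, sends the magic header (which SocketAccept uses to filter stray probes) and then one 1024-byte root info per communicator, while every other trainer listens on its own endpoint and copies the received bytes into the matching scope variable; with hierarchical allreduce enabled the same exchange is repeated for the inner and outer rings. The sketch below only models the wire framing for a single id with plain C++ containers; it is not the HCCL or Paddle API:

#include <array>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <string>
#include <vector>

constexpr char kCommHead[] = "_pd_gen_comm_id_";  // same magic prefix as COMM_HEAD
constexpr std::size_t kIdBytes = 1024;            // HCCL_UNIQUE_ID_BYTES

// Illustration only: what one sender connection carries -- the header that
// SocketAccept matches, followed by the raw root-info bytes.
std::vector<char> FrameRootInfo(const std::array<char, kIdBytes>& id) {
  const std::size_t head_len = std::strlen(kCommHead);
  std::vector<char> wire(head_len + kIdBytes);
  std::memcpy(wire.data(), kCommHead, head_len);
  std::memcpy(wire.data() + head_len, id.data(), kIdBytes);
  return wire;
}

int main() {
  std::array<char, kIdBytes> id{};
  id[0] = 'x';
  auto wire = FrameRootInfo(id);
  assert(wire.size() == std::strlen(kCommHead) + kIdBytes);
  assert(std::string(wire.data(), std::strlen(kCommHead)) == kCommHead);
  return 0;
}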
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listen on endpoint, then recv nccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// recv nccl id from socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 0ae7b821617f9..39a9ed0c74ef5 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr("dtype", "(int default 5('float32')) data type of tensor.") .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr>("out_shape", "shape of the output tensor.") .SetDefault(std::vector()); AddAttr( diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc new file mode 100644 index 0000000000000..69f1f4681a33d --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CRecvOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Output("Out"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 0000000000000..384dfd1fc5f2d --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Data"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + 
out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); +} + +TEST(recv_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomRecvOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index c5a86b4f08813..c60d560e43bae 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) nccl communication ring id.") .SetDefault(0); AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc new file mode 100644 index 0000000000000..0ade090fcaac0 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSendOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = rank; + + VLOG(3) << "begin hccl send, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc new file mode 100644 index 0000000000000..cf01b1d0a6a1d --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(send_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(send_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto op = 
f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); +} + +TEST(send_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 3cad86d96c26a..bf047de86fc21 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -23,29 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = z->mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, axis, Functor(), z); - } - } -}; - template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -153,16 +130,22 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index b1f3063583597..3ca700e16e6e7 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -14,11 +14,17 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index b7529e4ae632d..ff929ee7dfce7 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -68,7 +68,7 @@ struct NotEqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -80,21 +80,33 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* z = context.Output("Out"); int axis = context.Attr("axis"); - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } else { + ElementwiseComputeEx( + context, x, y, axis, InverseFunctor(), z); + } } }; } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ + REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>); diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc new file mode 100644 index 0000000000000..721354954c703 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +using LoDTensor = paddle::framework::LoDTensor; +using Tensor = paddle::framework::Tensor; + +namespace paddle { +namespace operators { + +class CopyCrossScopeOp : public framework::OperatorBase { + public: + CopyCrossScopeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int num_micro_scopes = scope.kids().size(); + int num_micro_batches = Attr("num_micro_batches"); + bool ToM = Attr("to_main_scope"); + PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches, + platform::errors::InvalidArgument( + "For pipeline, number of micro scopes (%d) should " + "be equal to number of micro batches (%d).", + num_micro_scopes, num_micro_batches)); + const std::string& id_name = Input("Id"); + auto* id_var = scope.FindVar(id_name); + PADDLE_ENFORCE_NOT_NULL( + id_var, + platform::errors::NotFound("No variable with name %s found.", id_name)); + auto id_tensor = id_var->GetMutable(); + auto it = scope.kids().begin(); + framework::Tensor cpu_id_tensor; + TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + auto id_value = cpu_id_tensor.data(); + for (auto i = 0; i < *id_value; i++) { + it++; + } + if (it == scope.kids().end()) { + if (ToM) { + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", + x_name)); + auto dst_tensor = dst_var->GetMutable(); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + return; + } + auto source_scope = *it; + it++; + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* source_var = source_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + source_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto 
src_tensor = source_var->GetMutable(); + auto dst_tensor = dst_var->GetMutable(); + TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + + if (ToM) { + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + } +}; + +class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), The first input tensor of copy_cross_scope op, which " + "is copying micro scope."); + AddInput("Id", + "(Tensor), The second input tensor of copy_cross_scope op, which " + "is a id of the current micro scope."); + AddAttr("to_main_scope", "Return current scope to main scope.") + .SetDefault(false); + AddAttr("num_micro_batches", "Number of micro batches for pipeline."); + AddComment(R"DOC( + This op is used by pipeline to copy tensors across micro batch scopes. + Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. + If need to copy back to the main scope, using to_main_scope option to copy the variable value of + the current micro scope to the main scope. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp, + ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc new file mode 100644 index 0000000000000..e175b235f9c18 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/copy_cross_scope_op.cc" +#include "paddle/fluid/string/printf.h" + +#define Conn(x, y) x##y + +namespace f = paddle::framework; +namespace p = paddle::platform; + +USE_NO_KERNEL_OP(copy_cross_scope); + +template +void Compare1(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {1}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + std::list::const_iterator iter = scope->kids().begin(); + iter++; + iter++; + + auto* kid_scope = *iter; + auto* dst_var = kid_scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 1; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +template +void Compare2(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {0}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + auto* dst_var = scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 0; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +#ifdef PADDLE_WITH_CUDA +TEST(copy_cross_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#elif PADDLE_WITH_ASCEND_CL +TEST(copy_cross_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + 
Compare2(&scope, ctx, "copy_cross_scope"); +} +#endif diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index a2279e40623b4..6a34ef48a169d 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -167,6 +167,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { auto sign = (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { @@ -189,12 +190,14 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout if (platform::is_cpu_place(context.GetPlace())) { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * sign.eval() * out_grad_t.broadcast(out_bcast_dims); } else { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); } } diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt new file mode 100644 index 0000000000000..4fe9cf214eaa7 --- /dev/null +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -0,0 +1,54 @@ +# compile flags +set(DLNNE_FLAGS + -Wno-error=non-virtual-dtor + -Wno-error=unused-variable + -Wno-error=attributes + ${fsanitize} +) +foreach(flag ${DLNNE_FLAGS}) + safe_set_cflag(CMAKE_C_FLAGS ${flag}) + safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) +endforeach() + + +# add nne +find_path(DLNNE_INCLUDE_DIR dlnne.h + PATHS + $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include + NO_DEFAULT_PATH +) + +find_library(DLNNE_LIB libdlnne.so + PATHS + $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne + NO_DEFAULT_PATH +) + +find_path(CUDA_INCLUDE_DIR cuda.h + $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include +) + +find_library(CURT_LIB libcurt.so + PATHS + $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib + NO_DEFAULT_PATH +) + + +message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR}) +message("DLNNE_LIB: "${DLNNE_LIB}) +message("CUDA_INCLUDE_DIR: "${CUDA_INCLUDE_DIR}) +message("CURT_LIB: "${CURT_LIB}) + +include_directories("${DLNNE_INCLUDE_DIR}") +include_directories("${CUDA_INCLUDE_DIR}") + +op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost device_context op_registry scope) + +#message("PYBIND_FILE:${pybind_file}") +#file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(dlnne_engine);\n") +#endif() + +target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB}) + +cc_test(test_dlnne_engine_op SRCS dlnne_engine_op_test.cc DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc new file mode 100644 index 0000000000000..4654e6a9f978a --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" + +namespace paddle { +namespace inference { + +void CopyTensorDeviceToCpu(void* dst_ptr, void* src_ptr, int total_bytes) { + cudaDeviceSynchronize(); + cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); +} +void CopyTensorCpuToDevice(void* dst_ptr, void* src_ptr, int total_bytes) { + cudaDeviceSynchronize(); + cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); +} + +} // namespace inference + +namespace operators { + +class DlnneEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDuplicable(); + AddOutput("Ys", "A list of outputs").AsDuplicable(); + AddAttr("subgraph", "the subgraph."); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different DLNNE Engines"); + AddAttr("sub_block", "the trt block"); + AddComment("Dlnne engine operator."); + } +}; + +class DlnneEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(dlnne_engine, ops::DlnneEngineOp, ops::DlnneEngineOpMaker); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h new file mode 100644 index 0000000000000..d426876c18fa5 --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
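(The two helpers defined in dlnne_engine_op.cc above are thin synchronous wrappers around cudaMemcpy. A minimal usage sketch, assuming a CUDA build; the function and buffer names here are illustrative and only the two helper signatures come from this patch:)

#include <cuda_runtime.h>
#include <vector>

void CopyRoundTripSketch() {
  std::vector<float> host_in(16, 1.0f);
  std::vector<float> host_out(16, 0.0f);
  int total_bytes = static_cast<int>(host_in.size() * sizeof(float));
  void* device_buf = nullptr;
  cudaMalloc(&device_buf, total_bytes);  // hypothetical device staging buffer
  // host -> device, then device -> host; both calls synchronize the device
  paddle::inference::CopyTensorCpuToDevice(device_buf, host_in.data(), total_bytes);
  paddle::inference::CopyTensorDeviceToCpu(host_out.data(), device_buf, total_bytes);
  cudaFree(device_buf);
}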
+ +#pragma once +#include // NOTLINT +#include // NOTLINT +#include // NOTLINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace dl { +namespace nne { +class Builder; +class Engine; +class Network; +class Parser; +class ExecutionContext; +} // namespace nne +} // namespace dl + +namespace paddle { +namespace inference { +class NneDeleter { + public: + NneDeleter() {} + + template + inline void operator()(T *ptr) { + if (ptr != nullptr) { + ptr->Destroy(); + } + } +}; + +void CopyTensorDeviceToCpu(void *dst_ptr, void *src_ptr, int total_bytes); + +void CopyTensorCpuToDevice(void *dst_ptr, void *src_ptr, int total_bytes); + +template +struct Singleton; +} // namespace inference +} // namespace paddle + +namespace paddle { + +namespace operators { + +class DlnneEngineOp : public framework::OperatorBase { + private: + std::vector input_names_; + std::unordered_set param_names_; + std::string engine_key_; + int num_inputs; + int num_outputs; + std::vector output_names; + std::vector input_names; + + dl::nne::Builder *builder; + dl::nne::Parser *parser; + dl::nne::Network *network; + dl::nne::ExecutionContext *context; + dl::nne::Engine *engine; + + unsigned int engine_input_size; + std::vector InputIndexToBindIndex_; + + public: + DlnneEngineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + input_names_ = Inputs("Xs"); + engine_key_ = Attr("engine_key"); + auto params = Attr>("parameters"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + + num_inputs = 0; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + num_inputs += 1; + input_names.push_back(x); + } + + num_outputs = Outputs("Ys").size(); + for (const auto &y : Outputs("Ys")) { + VLOG(4) << "y: " << y << std::endl; + output_names.push_back(y); + } + + // onnx path + std::stringstream filename; + std::string current_path = "."; + char *buffer; + if ((buffer = getcwd(NULL, 0)) != NULL) { + current_path = buffer; + } else { + current_path = "."; + } + filename << current_path << "/dump/" << engine_key_ << "/" << engine_key_ + << ".onnx"; + + builder = dl::nne::CreateInferBuilder(); + PADDLE_ENFORCE_NE(builder, nullptr, platform::errors::Unavailable( + "nne create builder failed")); + parser = dl::nne::CreateParser(); + PADDLE_ENFORCE_NE(parser, nullptr, platform::errors::Unavailable( + "nne create parser failed")); + + network = builder->CreateNetwork(); + + LOG(INFO) << "set output for dlnne"; + for (std::string &output_op_name : output_names) + parser->RegisterOutput(output_op_name.c_str()); + + LOG(INFO) << "parser onnx for dlnne"; + parser->Parse(filename.str().c_str(), *network); + + LOG(INFO) << "build network"; + engine = builder->BuildEngine(*network); + + // total size = input_size+output_size + engine_input_size = num_inputs + num_outputs; + for (std::string &input_name : input_names) { + int BindIndex = engine->GetBindingIndex(input_name.c_str()); + InputIndexToBindIndex_.push_back(BindIndex); + } + + for (std::string &output_name : output_names) { + int BindIndex = engine->GetBindingIndex(output_name.c_str()); + InputIndexToBindIndex_.push_back(BindIndex); + } 
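+    // Note: InputIndexToBindIndex_ now holds the engine binding index of each
+    // graph input followed by each graph output, so its length equals
+    // engine_input_size (num_inputs + num_outputs) and it maps the op's flat
+    // input/output order onto engine bindings at execution time.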
+ + // context + context = engine->CreateExecutionContext(); + } + + ~DlnneEngineOp() { + network->Destroy(); + context->Destroy(); + engine->Destroy(); + parser->Destroy(); + builder->Destroy(); + } + + protected: + void RunDlnneOnCreateEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + PADDLE_ENFORCE_EQ( + input_names_.empty(), false, + platform::errors::PreconditionNotMet( + "Dlnne engine needs at least one input, but no input is found. " + "Please check if you set the input correctly.")); + + std::vector input_buffers(num_inputs); + std::vector cpu_input_buffers(num_inputs); + std::vector> input_shapes(num_inputs); + std::vector input_data_types(num_inputs); + std::vector input_bytes(num_inputs); + + int index = 0; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + // convert input and copy to Dlnne engine's buffer + auto &t = + inference::analysis::GetFromScope(scope, x); + + const int bind_index = index; + index++; + int64_t data_bytes; + int32_t dtype; + auto type = t.type(); + data_bytes = 1; + void *buffer = nullptr; + if (type == framework::proto::VarType::FP32) { + buffer = static_cast(t.data()); + data_bytes = 4; + dtype = 0; + } else if (type == framework::proto::VarType::INT64) { + buffer = static_cast(t.data()); + data_bytes = 8; + dtype = 1; + } else if (type == framework::proto::VarType::INT32) { + buffer = static_cast(t.data()); + data_bytes = 4; + dtype = 2; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The DLNNE Engine OP only support float/int32_t/int64_t input.")); + } + input_buffers[bind_index] = buffer; + + auto t_shape = framework::vectorize(t.dims()); + std::vector runtime_input_shape(t_shape.begin(), t_shape.end()); + for (auto &size : t_shape) { + data_bytes = data_bytes * size; + } + + VLOG(4) << "buffers_size:" << data_bytes; + cpu_input_buffers[bind_index] = + input_buffers[bind_index]; // malloc(data_bytes); + input_shapes[bind_index] = runtime_input_shape; + input_data_types[bind_index] = dtype; + input_bytes[bind_index] = data_bytes; + } + + // output shape + std::vector> out_shapes; + std::vector output_bytes; + for (int i = 0; i < num_outputs; i++) { + int index = engine->GetBindingIndex(output_names[i].c_str()); + dl::nne::Dims out_dim = engine->GetBindingDimensions(index); + std::vector shape(out_dim.nbDims); + for (int dim = 0; dim < out_dim.nbDims; dim++) { + shape[dim] = (out_dim.d[dim]); + } + + out_shapes.push_back(shape); + int64_t data_bytes; + + // float32 + data_bytes = 4; + for (auto &size : shape) { + data_bytes = data_bytes * size; + } + VLOG(4) << "data_bytes: " << data_bytes; + output_bytes.push_back(data_bytes); + } + + int bind_index = 0; + std::vector cpu_output_buffers(num_outputs); + std::vector output_buffers(num_outputs); + std::vector output_dtypes(num_outputs); + + for (const auto &y : Outputs("Ys")) { + auto *fluid_v = scope.FindVar(y); + PADDLE_ENFORCE_NOT_NULL( + fluid_v, + platform::errors::NotFound( + "Output variable %s is not found in DLNNE subgraph.", y)); + + auto *fluid_t = fluid_v->GetMutable(); + + VLOG(4) << "out_shapes[bind_index] dim:" << out_shapes[bind_index].size(); + fluid_t->Resize(framework::make_ddim(out_shapes[bind_index])); + + int32_t dtype; + output_buffers[bind_index] = fluid_t->mutable_data( + BOOST_GET_CONST(platform::CPUPlace, dev_place)); + dtype = 0; + cpu_output_buffers[bind_index] = + output_buffers[bind_index]; // malloc(data_bytes); + output_dtypes[bind_index] = dtype; + bind_index++; + } + + std::vector 
engine_input_ptr(engine_input_size); + + // set input_ptr + for (unsigned int i = 0; i < engine_input_size; i++) { + if (InputIndexToBindIndex_[i] < 0) continue; + + if (engine->BindingIsInput(InputIndexToBindIndex_[i])) { + // copy cpu buffer to gpu buffer + int64_t total_bytes; + total_bytes = input_bytes[i]; + VLOG(4) << "input_bytes: " << total_bytes; + + void *gpu_ptr; + cudaMalloc(&gpu_ptr, total_bytes); + engine_input_ptr[InputIndexToBindIndex_[i]] = gpu_ptr; + + paddle::inference::CopyTensorCpuToDevice( + gpu_ptr, reinterpret_cast(cpu_input_buffers[i]), + total_bytes); + + } else { + int64_t total_size; + total_size = output_bytes[i - input_names.size()]; + VLOG(4) << "output_bytes: " << total_size; + void *gpu_ptr; + cudaMalloc(&gpu_ptr, total_size); + engine_input_ptr[InputIndexToBindIndex_[i]] = gpu_ptr; + } + } + + clock_t startTime, endTime; + startTime = clock(); + context->Execute(1, engine_input_ptr.data()); + endTime = clock(); + double during_ms = + static_cast(endTime - startTime) / CLOCKS_PER_SEC * 1000; + LOG(INFO) << "dlNNE execute time: " << during_ms << " ms"; + + bind_index = 0; + for (unsigned int i = 0; i < engine_input_size; i++) { + if (InputIndexToBindIndex_[i] < 0) continue; + + if (i >= input_names.size()) { + void *cpu_ptr = cpu_output_buffers[i - input_names.size()]; + int64_t size; + size = output_bytes[i - input_names.size()]; + paddle::inference::CopyTensorDeviceToCpu( + cpu_ptr, engine_input_ptr[InputIndexToBindIndex_[i]], size); + // dtype: float32 + int32_t dtypes; + dtypes = 0; + + cpu_output_buffers[bind_index] = cpu_ptr; + output_dtypes[bind_index] = dtypes; + bind_index++; + } + cudaFree(engine_input_ptr[InputIndexToBindIndex_[i]]); + } + } + + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunDlnneOnCreateEngine(scope, dev_place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc new file mode 100644 index 0000000000000..caf1a80fcc737 --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -0,0 +1,237 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
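(A note on the timing block in RunDlnneOnCreateEngine above: clock() measures processor time rather than wall time. A sketch of a wall-clock alternative using std::chrono, reusing the context and engine_input_ptr names from that function as assumptions:)

#include <chrono>

  auto start = std::chrono::steady_clock::now();
  context->Execute(1, engine_input_ptr.data());
  auto end = std::chrono::steady_clock::now();
  double during_ms =
      std::chrono::duration<double, std::milli>(end - start).count();
  LOG(INFO) << "dlNNE execute time: " << during_ms << " ms";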
+ +#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" +#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" + +USE_NO_KERNEL_OP(dlnne_engine); +namespace paddle { +namespace operators { + +namespace { +void CreateCUDATensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { + auto* var = scope->Var(name); + auto* tensor = var->GetMutable(); + auto dims = framework::make_ddim(shape); + tensor->Resize(dims); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + inference::tensorrt::RandomizeTensor(tensor, place, ctx); +} + +void AddTensorToBlockDesc(framework::proto::BlockDesc* block, + const std::string& name, + const std::vector& shape) { + using framework::proto::VarType; + auto* var = block->add_vars(); + framework::VarDesc desc(name); + desc.SetType(VarType::LOD_TENSOR); + desc.SetDataType(VarType::FP32); + desc.SetShape(shape); + *var = *desc.Proto(); +} + +} // namespace + +using inference::analysis::SetAttr; + +TEST(DlnneEngineOp, manual) { + framework::ProgramDesc program; + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + LOG(INFO) << "create fc op"; + auto* fc0 = block_desc.AppendOp(); + fc0->SetType("fc"); + fc0->SetInput("X", std::vector({"x"})); // 4 x 1 x 1 + fc0->SetInput("Y", std::vector({"y"})); // 4 x 6 + fc0->SetOutput("Out", std::vector({"z"})); // 6 x 1 x 1 + + LOG(INFO) << "create fc op"; + auto* fc1 = block_desc.AppendOp(); + fc1->SetType("fc"); + fc1->SetInput("X", std::vector({"z"})); + fc1->SetInput("Y", std::vector({"y0"})); // 6 x 8 + fc1->SetOutput("Out", std::vector({"z0"})); // 8 x 1 x 1 + + // Set inputs' variable shape in BlockDesc + // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1} + AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 1, 1})); + AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); + AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); + AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); + + // It is wired, need to copy manually. 
+ *block_->add_ops() = *fc0->Proto(); + *block_->add_ops() = *fc1->Proto(); + + ASSERT_EQ(block_->ops_size(), 2); + + LOG(INFO) << "create dlnne desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("dlnne_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x"})); + engine_op_desc.SetOutput("Ys", std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("a_calib_engine")); + engine_op_desc.SetAttr("predictor_id", 1); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); + engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); + int device_id = 0; + engine_op_desc.SetAttr("gpu_id", device_id); + + LOG(INFO) << "create engine op"; + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + LOG(INFO) << "engine_op " << engine_op.get(); + + framework::Scope scope; + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + // Prepare variables. + CreateCUDATensor(&scope, "x", std::vector({2, 4})); + CreateCUDATensor(&scope, "y", std::vector({4, 6})); + CreateCUDATensor(&scope, "z", std::vector({2, 6})); + + CreateCUDATensor(&scope, "y0", std::vector({6, 8})); + CreateCUDATensor(&scope, "z0", std::vector({2, 8})); + + // Execute them. + LOG(INFO) << "engine_op run"; + engine_op->Run(scope, place); +} + +void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + using shape_t = std::vector; + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + + auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name, + const std::string& z_name, bool x_created, + const shape_t& x_shape, const shape_t& y_shape, + const shape_t& z_shape) { + LOG(INFO) << "create fc op"; + auto* fc = block_desc.AppendOp(); + fc->SetType("mul"); + fc->SetInput("X", std::vector({x_name})); + fc->SetInput("Y", std::vector({y_name})); + fc->SetOutput("Out", std::vector({z_name})); + + // Set inputs' variable shape in BlockDesc + if (!x_created) { + AddTensorToBlockDesc(block_, x_name, + std::vector({batch_size, input_dim, 1, 1})); + } + AddTensorToBlockDesc(block_, y_name, + std::vector({input_dim, output_dim})); + AddTensorToBlockDesc(block_, z_name, + std::vector({batch_size, output_dim})); + + // Prepare variables. + if (!x_created) { + CreateCUDATensor(&scope, x_name, std::vector(x_shape)); + } + CreateCUDATensor(&scope, y_name, std::vector(y_shape)); + CreateCUDATensor(&scope, z_name, std::vector(z_shape)); + + // It is wired, need to copy manually. 
+ *block_->add_ops() = *fc->Proto(); + }; + + // Test with 4 layer FC + AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim}, + {input_dim, output_dim}, {batch_size, output_dim}); + AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + + LOG(INFO) << "create dlnne desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("dlnne_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x0"})); + engine_op_desc.SetOutput("Ys", std::vector({"z3"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("b_calib_engine")); + engine_op_desc.SetAttr("predictor_id", 1); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); + engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); + int device_id = 0; + engine_op_desc.SetAttr("gpu_id", device_id); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + + // Execute them. + engine_op->Run(scope, place); +} + +// Test with a larger FC layer. +TEST(DlnneEngineOp, fc) { Execute(40, 28, 28); } + +} // namespace operators +} // namespace paddle + +USE_TRT_CONVERTER(fc) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 0ca03fc32fbf6..5c444e752e797 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" @@ -34,7 +33,9 @@ namespace operators { */ template struct CudaAddFunctor { - inline HOSTDEVICE T operator()(T args[]) const { return args[0] + args[1]; } + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] + args[1]; + } }; template diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 36add2112974d..321826ec647c9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -13,6 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +#ifdef __HIPCC__ +#define ELEMENTWISE_BLOCK_SIZE 256 +#else +#define ELEMENTWISE_BLOCK_SIZE 512 +#endif + namespace paddle { namespace operators { @@ -90,8 +101,7 @@ struct ElementwiseDataWrapper { template __device__ void VectorizedKernelImpl( - ElementwiseDataWrapper data, int size, Functor func, - int tid) { + ElementwiseDataWrapper data, Functor func, int tid) { using VecType = CudaAlignedVector; VecType ins_vec[ET]; VecType out_vec; @@ -121,10 +131,9 @@ __device__ void VectorizedKernelImpl( data.store_vector(out_vec, tid); } -template -__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, - int size, Functor func, int start, - int remain) { +template +__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, + Functor func, int start, int remain) { T ins[ET]; T out; @@ -146,12 +155,11 @@ __global__ void VectorizedKernel(const T *__restrict__ in0, int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = size - VecSize * tid; remain = remain > 0 ? remain : 0; + auto data = ElementwiseDataWrapper(out, in0, in1); if (remain >= VecSize) { - auto data = ElementwiseDataWrapper(out, in0, in1); - VectorizedKernelImpl(data, size, func, tid); + VectorizedKernelImpl(data, func, tid); } else { - auto data = ElementwiseDataWrapper(out, in0, in1); - ScalarKernelImpl(data, size, func, tid * VecSize, remain); + ScalarKernelImpl(data, func, tid * VecSize, remain); } } @@ -162,7 +170,7 @@ __global__ void ScalarKernel(const T *__restrict__ in0, auto data = ElementwiseDataWrapper(out, in0, in1); int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = tid < size ? 
1 : 0; - ScalarKernelImpl(data, size, func, tid, remain); + ScalarKernelImpl(data, func, tid, remain); } template @@ -173,7 +181,7 @@ void LaunchElementwiseCudaKernel( // calculate the max vec_size for all ins and outs auto size = ins[0]->numel(); int vec_size = GetVectorizedSize(ins, *outs); - int block_size = PADDLE_CUDA_THREAD_SIZE; + int block_size = ELEMENTWISE_BLOCK_SIZE; int grid_size = ((size + vec_size - 1) / vec_size + block_size - 1) / block_size; const T *in0 = ins[0]->data(); diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 8850dc45d24c8..e2bf61de63196 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -147,7 +147,7 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand_as, ops::ExpandAsKernel, ops::ExpandAsKernel, diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index f0a6e6ec33242..5296a144f6247 100644 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -129,7 +129,7 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand_as_v2, ops::ExpandAsV2Kernel, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 6797639b94721..e7da08ff27711 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -273,7 +273,7 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, ops::ExpandKernel, diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 453a990efbded..bb3a6512d2c8b 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -79,6 +79,7 @@ class ExpandNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( expand, ops::ExpandNPUKernel, + ops::ExpandNPUKernel, ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index e65011bf7de63..618c1560c5eac 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -278,7 +278,7 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandV2GradKernel, ops::ExpandV2GradKernel, ops::ExpandV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand_v2, ops::ExpandV2Kernel, ops::ExpandV2Kernel, diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index f614d906baa75..87618b954d232 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -55,19 +55,19 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto *ids_t = ctx.Input("Ids"); - auto 
*output_grad_t = ctx.Input(framework::GradVarName("Out")); auto *table_grad_t = ctx.Output(framework::GradVarName("W")); - auto *p = table_grad_t->mutable_data(ctx.GetPlace()); + table_grad_t->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() .stream(); - platform::NPUMemsetAsync(static_cast(p), 0, - table_grad_t->numel() * sizeof(T), stream); + auto runner_zeros = + NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); + runner_zeros.Run(stream); // NOTE(zhiqiu): It seems in cann 20.1, the first input and output // can be different tensor, but in cann 20.2+, it does inplace operation. @@ -86,9 +86,11 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( lookup_table_v2, ops::LookupTableV2NPUKernel, + ops::LookupTableV2NPUKernel, ops::LookupTableV2NPUKernel); REGISTER_OP_NPU_KERNEL( lookup_table_v2_grad, ops::LookupTableV2GradNPUKernel, + ops::LookupTableV2GradNPUKernel, ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 2ad3c9f923a31..54600e26bb57f 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -157,7 +157,7 @@ REGISTER_OP_CPU_KERNEL( ops::MeshgridGradKernel, ops::MeshgridGradKernel, ops::MeshgridGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( meshgrid, ops::MeshgridKernel, ops::MeshgridKernel, diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 9fcc629233891..843736833f815 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -32,6 +32,12 @@ namespace cub = hipcub; #include "paddle/fluid/platform/cudnn_helper.h" #endif +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + namespace paddle { namespace operators { @@ -58,12 +64,10 @@ using DataLayout = framework::DataLayout; // axis=(n,h,w))) template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, const int N, const int C, - const int sample_size, const double epsilon, - T *dx) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDX( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const T *scale, const T *ddscale, const int N, const int C, + const int sample_size, const double epsilon, T *dx) { const int outer_size = C; const int inner_size = N * sample_size; @@ -160,12 +164,10 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * // np.mean(ddx * (x - mean), axis=(n,h,w))) template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, const int N, const int C, - const int sample_size, - const double epsilon, T *ddy) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDDY( + const T *x, const T *mean, const T *variance, const T *ddscale, + const T *ddbias, const T *ddx, const T *scale, const int N, const int C, + const int sample_size, const double epsilon, T *ddy) { const int outer_size = C; const int inner_size = N * sample_size; @@ -238,11 +240,10 @@ __global__ void DoubleGradComputeDDY(const T *x, const T *mean, // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * // ddx 
template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const int N, const int C, - const int sample_size, - const double epsilon, T *dscale) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScale( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const int N, const int C, const int sample_size, const double epsilon, + T *dscale) { const int outer_size = C; const int inner_size = N * sample_size; @@ -302,7 +303,7 @@ __global__ void DoubleGradComputeDScale(const T *x, const T *mean, // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var template -__global__ void DoubleGradComputeDScaleWithGlobal( +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScaleWithGlobal( const T *ddx, const T *variance, const T *dy, const double epsilon, const int N, const int C, const int sample_size, T *dscale) { int outer_size = C; @@ -422,8 +423,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? Scale->data() : scale_tmp.data(); - +#ifdef __HIPCC__ + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); @@ -532,6 +536,5 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } } } - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index b024aca21c382..a922a2bca66ad 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/optimizers/adam_op.h" @@ -122,8 +123,9 @@ class AdamNPUKernel : public framework::OpKernel { FillNpuTensorWithConstant(&beta2_tensor, beta2); Tensor epsilon_tensor(framework::proto::VarType::FP32); - epsilon_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&epsilon_tensor, epsilon); + TensorFromVector(std::vector{epsilon}, + ctx.template device_context(), + &epsilon_tensor); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 0d5c23bed6016..65e10181dcc3d 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -60,33 +60,51 @@ void RunPyObject(py::object *py_object, outs->size(), result_tuple.size())); } for (size_t i = 0; i < result_tuple.size(); i++) { - if (Py_None != result_tuple[i].ptr()) { + if ((*outs)[i] != nullptr) { + if (Py_None != result_tuple[i].ptr()) { + try { + auto result_var = + result_tuple[i].cast>(); + *(*outs)[i] = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The output of `PyLayer.backward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth input tensor of forward needs gradient and the " + "corresponding gradient cannot be None.", + i)); + } + } else { + if (Py_None != result_tuple[i].ptr()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth input tensor of forward do not need gradient and the " + "corresponding gradient should be `None`.", + i)); + } + } + } + } else { + if ((*outs)[0] != nullptr) { + if (Py_None != py_result.ptr()) { try { auto result_var = - result_tuple[i].cast>(); - *(*outs)[i] = result_var->Var(); + py_result.cast>(); + *((*outs)[0]) = result_var->Var(); } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(platform::errors::InvalidArgument( "The output of `PyLayer.backward` should be `Tensor`.")); } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` can not be `None`.")); - } - } - } else { - if (Py_None != py_result.ptr()) { - try { - auto result_var = - py_result.cast>(); - *((*outs)[0]) = result_var->Var(); - } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` should be `Tensor`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor of forward needs gradient, so the output of " + "`PyLayer.backward` can not be `None`.")); } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` can not be `None`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor of forward do not need gradient, so the output of " + "`PyLayer.backward` should be `None`.")); } } } diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 98a68ca69cafd..1aa93c80387e6 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -68,7 +68,7 @@ TEST(BlockingQueue, SenderBlockingTest) { ++send_count; } }); - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); q.Close(); sender.join(); 
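  // The longer sleep gives the sender time to fill the queue and block on the
  // next Send(); Close() then wakes it and that Send() fails, so exactly
  // queue_cap sends were counted.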
EXPECT_EQ(send_count, queue_cap); diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc new file mode 100644 index 0000000000000..fb82d18e62f3b --- /dev/null +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -0,0 +1,314 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +using TensorList = std::vector; + +template +void reset_parameter_vector(const std::vector& raw_params_vec, + const int& num_layers, const bool& is_bidirec, + std::vector>* params_vec) { + // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers + // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to + // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers + const int& direction_num = is_bidirec ? 2 : 1; + const int& layer_weight_size = 4 * direction_num; + const int& all_weight_size = num_layers * layer_weight_size; + const int& bias_start_idx = all_weight_size / 2; + for (int i = 0; i < num_layers; i++) { + params_vec->at(i).resize(layer_weight_size); + for (int j = 0; j < layer_weight_size; j++) { + int k = j % 4; + const int& section = j / 4; + int tensor_idx = i * 2 * direction_num + section * 2 + k % 2; + if (k >= 2) { + tensor_idx += bias_start_idx; + } + using remove_cv_t = typename std::remove_cv::type; + params_vec->at(i)[j] = + raw_params_vec[tensor_idx]->template data(); + } + } +} + +template +class RnnXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto state = ctx.MultiOutput("State"); + auto* output = ctx.Output("Out"); + auto* reserve_data = ctx.Output("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + PADDLE_ENFORCE_EQ(is_bidirec, false, + platform::errors::InvalidArgument( + "XPU only support unidirectional LSTM now")); + + PADDLE_ENFORCE_EQ( + num_layers, 1, + platform::errors::InvalidArgument( + "XPU only support 1 layer LSTM now, current layer num is %s", + num_layers)); + + auto init_h = pre_state[0]; + auto init_c = pre_state[1]; + auto last_h = state[0]; + auto last_c = 
state[1]; + + // check shape + int seq_len = input->dims()[0]; + int batch_size = input->dims()[1]; + int input_dim = input->dims()[2]; + + PADDLE_ENFORCE_EQ( + init_h->dims()[0], num_layers, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init " + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, init_h->dims()[0])); + + PADDLE_ENFORCE_EQ( + init_c->dims()[0], num_layers, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, init_c->dims()[0])); + + std::vector> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector(weight_list, num_layers, is_bidirec, + ¶meter_lists); + + // init the output and allocate the memory + output->mutable_data(ctx.GetPlace()); + last_h->mutable_data(ctx.GetPlace()); + last_c->mutable_data(ctx.GetPlace()); + reserve_data->Resize({seq_len * batch_size * hidden_size * 5}); + reserve_data->mutable_data(ctx.GetPlace()); + + // get ptr from tensor + auto x = input->data(); + auto h_0 = init_h->data(); + auto c_0 = init_c->data(); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto b_x = parameter_lists[0][2]; + auto b_h = parameter_lists[0][3]; + auto y = output->data(); + auto last_h_ptr = last_h->data(); + auto last_c_ptr = last_c->data(); + auto i_f_g_o = reserve_data->data(); + auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + + std::vector seq_len_tensor(batch_size, seq_len); + if (has_seq_length) { + seq_len_tensor = operators::GetDataFromTensor(sequence_length); + } + + // run kernel + auto& dev_ctx = ctx.template device_context(); + int r = xpu::lstm_train( + dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, + (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h, + reinterpret_cast(y), reinterpret_cast(last_h_ptr), + reinterpret_cast(last_c_ptr), batch_size, input_dim, hidden_size, + seq_len, seq_len_tensor, nullptr, nullptr, nullptr, nullptr, + reinterpret_cast(i_f_g_o), reinterpret_cast(c)); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External("RnnXPU(lstm) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +template +class RnnXPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // get the tensor pointer for the input + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto* output = ctx.Input("Out"); + auto* reserve_data = ctx.Input("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + PADDLE_ENFORCE_EQ(is_bidirec, false, + platform::errors::InvalidArgument( + "XPU only support unidirectional LSTM now")); + + PADDLE_ENFORCE_EQ( + num_layers, 1, + platform::errors::InvalidArgument( + "XPU only support 1 layer LSTM now, current layer num is %s", + num_layers)); + + auto init_h = pre_state[0]; + auto init_c = 
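// As in the forward kernel, PreState packs {h0, c0}; the backward pass also reads the forward
// Out and the Reserve buffer (the i_f_g_o gate activations plus the cell states) written above.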
pre_state[1]; + + auto output_grad = ctx.Input(framework::GradVarName("Out")); + auto state_grad = ctx.MultiInput(framework::GradVarName("State")); + auto last_h_grad = state_grad[0]; + auto last_c_grad = state_grad[1]; + + // get the tensor pointer for the output + auto* input_grad = ctx.Output(framework::GradVarName("Input")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); + auto pre_state_grad = + ctx.MultiOutput(framework::GradVarName("PreState")); + Tensor* init_h_grad = nullptr; + Tensor* init_c_grad = nullptr; + if (pre_state_grad.size() > 0) { // has gradient + init_h_grad = pre_state_grad[0]; + init_c_grad = pre_state_grad[1]; + } + + // check shape + int seq_len = input->dims()[0]; + int batch_size = input->dims()[1]; + int input_dim = input->dims()[2]; + + PADDLE_ENFORCE_EQ( + init_h->dims()[0], num_layers, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init " + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, init_h->dims()[0])); + + PADDLE_ENFORCE_EQ( + init_c->dims()[0], num_layers, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, init_c->dims()[0])); + + std::vector> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector(weight_list, num_layers, is_bidirec, + ¶meter_lists); + + for (unsigned int i = 0; i < weight_grad_list.size(); ++i) { + weight_grad_list[i]->mutable_data(ctx.GetPlace()); + } + std::vector> parameter_lists_grad; + parameter_lists_grad.resize(num_layers); + reset_parameter_vector(weight_grad_list, num_layers, is_bidirec, + ¶meter_lists_grad); + + // allocate the memory and initization the input_grad + input_grad->mutable_data(input->dims(), ctx.GetPlace()); + if (init_h_grad) { + init_h_grad->mutable_data(init_h->dims(), ctx.GetPlace()); + } + if (init_c_grad) { + init_c_grad->mutable_data(init_c->dims(), ctx.GetPlace()); + } + + // get ptr from tensor + auto x = input->data(); + auto h_0 = init_h->data(); + auto c_0 = init_c->data(); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto y = output->data(); + auto y_grad = output_grad->data(); + auto last_h_grad_ptr = last_h_grad->data(); + auto last_c_grad_ptr = last_c_grad->data(); + auto x_grad = input_grad->data(); + auto h_0_grad = init_h_grad ? init_h_grad->data() : nullptr; + auto c_0_grad = init_c_grad ? 
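// Gradients w.r.t. the initial states are optional; when they are not requested, null pointers
// are forwarded to xpu::lstm_grad below.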
init_c_grad->data() : nullptr; + auto w_x_grad = parameter_lists_grad[0][0]; + auto w_h_grad = parameter_lists_grad[0][1]; + auto b_x_grad = parameter_lists_grad[0][2]; + auto b_h_grad = parameter_lists_grad[0][3]; + auto i_f_g_o = reserve_data->data(); + auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + + std::vector seq_len_tensor(batch_size, seq_len); + if (has_seq_length) { + seq_len_tensor = operators::GetDataFromTensor(sequence_length); + } + + auto& dev_ctx = ctx.template device_context(); + int r = xpu::lstm_grad( + dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, + (const T*)w_x, (const T*)w_h, (const T*)y, (const T*)y_grad, + (const T*)last_h_grad_ptr, (const T*)last_c_grad_ptr, + reinterpret_cast(x_grad), reinterpret_cast(h_0_grad), + reinterpret_cast(c_0_grad), w_x_grad, w_h_grad, b_x_grad, b_h_grad, + batch_size, input_dim, hidden_size, seq_len, seq_len_tensor, nullptr, + nullptr, nullptr, nullptr, i_f_g_o, c); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("RnnXPUGrad(lstm) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + rnn, ops::RnnXPUKernel); +REGISTER_OP_XPU_KERNEL( + rnn_grad, ops::RnnXPUGradKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index ee7210a7784d7..cbfd11834ae47 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -34,6 +34,8 @@ class ScaleNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); float _power = 1.0; + VLOG(4) << "scale:" << scale << ", bias:" << bias + << " ,bias_after_scale:" << bias_after_scale; if (bias_after_scale) { out->mutable_data(ctx.GetPlace()); auto runner = diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index 144e7ceae20c1..2d23e81717abb 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -50,10 +50,15 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE( index_dims[index_dims_size - 1], ref_dims_size, platform::errors::InvalidArgument( - "Input(Index).shape[-1] should be no greater than Input(X).rank")); + "The last dimension of Input(Index)'s shape should be no greater " + "than the rank of Input(X), but received the last dimension of " + "Input(Index)'s shape is %d, the rank of Input(X) is %d.", + index_dims[index_dims_size - 1], ref_dims_size)); PADDLE_ENFORCE_GE(index_dims_size, 2UL, platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1")); + "The rank of Input(Index) should be greater than 1, " + "but received the rank of Input(Index) is %d.", + index_dims_size)); // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] std::vector r_updates_dims; @@ -66,12 +71,21 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( r_updates_dims.size(), updates_dims_size, - platform::errors::InvalidArgument("Updates has wrong shape")); + platform::errors::InvalidArgument( + "Updates has wrong shape. 
The rank of Input(Updates) should be equal to the rank inferred " + "from Input(Index) and Input(X), but received the inferred rank is " + "%d, the rank of Input(Updates) is %d.", + r_updates_dims.size(), updates_dims_size)); for (int64_t i = 0; i < updates_dims_size; ++i) { PADDLE_ENFORCE_EQ( r_updates_dims[i], updates_dims[i], - platform::errors::InvalidArgument("Updates has wrong shape")); + platform::errors::InvalidArgument( + "Updates has wrong shape. The dimensions of Updates and " + "Input(Updates) should match, but received Updates's " + "%d-th dimension is %d, Input(Updates)'s %d-th " + "dimension is %d.", + i, r_updates_dims[i], i, updates_dims[i])); } ctx->SetOutputDim("Out", ref_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3fc40d41c3081..f0faa0c579833 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -41,15 +41,24 @@ class ScatterOp : public framework::OperatorWithKernel { auto ref_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ( ctx->GetInputDim("Ids").size(), 1, - platform::errors::InvalidArgument("Update Ids should be 1-D.")); + platform::errors::InvalidArgument( + "The size of Input(Ids)'s shape should be equal to 1, but " + "received the rank of Input(Ids) is %d.", + ctx->GetInputDim("Ids").size())); PADDLE_ENFORCE_EQ( ref_dims.size(), updates_dims.size(), platform::errors::InvalidArgument( - "Rerence and Updates should have the same shape size.")); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], - ctx->GetInputDim("Ids")[0], - platform::errors::InvalidArgument( - "Updates and Ids should have same batch-size.")); + "Input(X) and Input(Updates) should have the same shape size, " + "but received the size of Input(X)'s shape is %d, the size of " + "Input(Updates)'s shape is %d.", + ref_dims.size(), updates_dims.size())); + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], + platform::errors::InvalidArgument( + "Input(Updates) and Input(Ids) should have the same batch-size, but" + " received Input(Updates)'s batch-size is %d, Input(Ids)'s " + "batch-size is %d.", + ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0])); ctx->SetOutputDim("Out", ref_dims); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 105d61015fcb9..96a132ac6abc2 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -146,22 +146,75 @@ Assignment to a Tensor in static mode. 
)DOC"); } }; + +template +class SetValueGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + if (this->HasInput("ValueTensor")) { + op->SetType("slice"); + op->SetInput("Input", this->OutputGrad("Out")); + if (this->HasInput("StartsTensorList")) { + op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); + } + + // convert std::vector to std::vector + std::vector axes_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("axes"))); + std::vector starts_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("starts"))); + std::vector ends_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("ends"))); + std::vector decrease_axes_int64 = + static_cast>(BOOST_GET_CONST( + std::vector, this->GetAttr("decrease_axes"))); + + std::vector axes(axes_int64.begin(), axes_int64.end()); + std::vector starts(starts_int64.begin(), starts_int64.end()); + std::vector ends(ends_int64.begin(), ends_int64.end()); + std::vector decrease_axes(decrease_axes_int64.begin(), + decrease_axes_int64.end()); + + op->SetAttr("axes", axes); + op->SetAttr("starts", starts); + op->SetAttr("ends", ends); + op->SetAttr("decrease_axis", decrease_axes); + op->SetAttr("infer_flags", std::vector({})); + + op->SetOutput("Out", this->InputGrad("ValueTensor")); + } else { + op->SetType("assign"); + op->SetInput("X", this->OutputGrad("Out")); + op->SetOutput("Out", this->InputGrad("Input")); + } + } +}; + +DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; -REGISTER_OPERATOR( - set_value, ops::SetValue, ops::SetValueMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); +REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, + ops::SetValueGradMaker, + ops::SetValueGradMaker, + ops::SetValueOpInplaceInferer); REGISTER_OP_CPU_KERNEL( set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); + ops::SetValueKernel, + ops::SetValueKernel, + ops::SetValueKernel, + ops::SetValueKernel); REGISTER_OP_VERSION(set_value) .AddCheckpoint( diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 9c30c4e07fa77..22f6fa9e3e6f2 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -259,7 +259,20 @@ class SliceKernel : public framework::OpKernel { auto out_t = framework::EigenTensor::From( *out, new_out_dims); - out_t.device(place) = in_t.slice(offsets, extents); + + if (in->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.slice: + // if element number less than INT_MAX, change the type of index to int + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = offsets[i]; + extents_32bit[i] = extents[i]; + } + framework::To32BitIndex(out_t).device(place) = + framework::To32BitIndex(in_t).slice(offsets_32bit, extents_32bit); + } else { + out_t.device(place) = in_t.slice(offsets, extents); + } out->Resize(out_dims); } @@ -300,8 +313,6 @@ class SliceGradKernel : public framework::OpKernel { private: template void SliceCompute(const framework::ExecutionContext& context) const { - auto& place = - 
*context.template device_context().eigen_device(); auto axes = context.Attr>("axes"); auto starts_int = context.Attr>("starts"); @@ -435,13 +446,189 @@ class SliceGradKernel : public framework::OpKernel { paddings[i].first = offsets[i]; paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; } + EigenPaddingCompute(context, d_input, in_dims, d_out, out_dims, paddings); + } + + template + void EigenPaddingCompute( + const framework::ExecutionContext& context, framework::Tensor* d_input, + const framework::DDim& in_dims, const framework::Tensor* d_out, + const framework::DDim& out_dims, + const Eigen::array, D>& paddings) const { + if (D <= 3) { + // if dimension less than 3, cannot reduce dimension + LaunchEigenPadding(context, d_input, in_dims, d_out, out_dims, paddings); + } else { // else we can reduce dimension + // count not-zero padding number, and record the dimension + int need_pad_num = 0, pad_dim = -1; + for (size_t i = 0; i < D; i++) { + if (paddings[i].first != 0 || paddings[i].second != 0) { + need_pad_num++; + pad_dim = i; + } + } + + if (need_pad_num == 0) { + // do not need padding, pass if data address same, else copy + if (d_input->mutable_data(context.GetPlace()) == d_out->data()) { + // inplace, do not any operator, pass + } else { + framework::TensorCopy( + *d_out, context.GetPlace(), + context.template device_context(), + d_input); + } + } else if (need_pad_num == 1) { + // only need padding one dimension, we can reduce dimension. + // only the padding dimension is available for us. + // How to reduce dimension(5 to 3 for example): + // before(D=5): + // in_dims: [x1, x2, x3, x4, x5] + // padding.first: [0, 0, a, 0, 0] + // padding.second: [0, 0, b, 0, 0] + // | | + // V V + // after(D=3): + // reshaped_in_dims: [x1*x2, x3, x4*x5] + // reshaped_padding.first: [0, a, 0] + // reshaped_padding.second: [0, b, 0] + + if (pad_dim == D - 1) { + // only last dimension need padding, + // reshape the dimension of tensor in 2: [preceding, padding] + std::vector in_tore_shape(2, 1), out_tore_shape(2, 1); + Eigen::array, 2> reshaped_padding; + + // first dimension is the accumulate of preceding dimension + for (int i = 0; i < pad_dim; i++) { + in_tore_shape[0] *= in_dims[i]; + out_tore_shape[0] *= out_dims[i]; + } + // second dimension is the padding dimension + in_tore_shape[1] = in_dims[pad_dim]; + out_tore_shape[1] = out_dims[pad_dim]; + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: the first dimension do not need padding, + // set padding[0] zero + reshaped_padding[0].first = reshaped_padding[0].second = 0; + // the second dimension is the previous padding dimension + reshaped_padding[1].first = paddings[pad_dim].first; + reshaped_padding[1].second = paddings[pad_dim].second; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } else if (pad_dim == 0) { + // only first dimension need padding, + // reshape the dimension of tensor in 2: [padding, succeeding] + // similar to (D - 1) + std::vector in_tore_shape(2, 1), out_tore_shape(2, 1); + Eigen::array, 2> reshaped_padding; + + // first dimension is the padding dimension + in_tore_shape[0] = in_dims[pad_dim]; + out_tore_shape[0] = out_dims[pad_dim]; + // sencond dimension is the accumulate of succeeding dimension + for (size_t i = pad_dim + 1; i < D; i++) { + in_tore_shape[1] *= in_dims[i]; + 
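// fold every dimension after pad_dim into one so the Eigen pad below operates on a rank-2 view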
out_tore_shape[1] *= out_dims[i]; + } + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: + // the first dimension is the previous padding dimension + reshaped_padding[0].first = paddings[pad_dim].first; + reshaped_padding[0].second = paddings[pad_dim].second; + // the second dimension do not need padding, set padding[1] zero + reshaped_padding[1].first = reshaped_padding[1].second = 0; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } else { + // other dimension need padding + // reshape the dimension of tensor in 3: + // [preceding, padding, succeeding] + std::vector in_tore_shape(3, 1), out_tore_shape(3, 1); + Eigen::array, 3> reshaped_padding; + + // first dimension is the accumulate of preceding dimension + for (int i = 0; i < pad_dim; i++) { + in_tore_shape[0] *= in_dims[i]; + out_tore_shape[0] *= out_dims[i]; + } + // second dimension is the padding dimension + in_tore_shape[1] = in_dims[pad_dim]; + out_tore_shape[1] = out_dims[pad_dim]; + // third dimension is the accumulate of succeeding dimension + for (size_t i = pad_dim + 1; i < D; i++) { + in_tore_shape[2] *= in_dims[i]; + out_tore_shape[2] *= out_dims[i]; + } + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: + // the first dimension do not need padding, set padding[0] zero + reshaped_padding[0].first = reshaped_padding[2].second = 0; + // the second dimension is the previous padding dimension + reshaped_padding[1].first = paddings[pad_dim].first; + reshaped_padding[1].second = paddings[pad_dim].second; + // the third dimension do not need padding, set padding[2] zero + reshaped_padding[2].first = reshaped_padding[2].second = 0; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } + } else { + // need padding at many dimension, cannot reduce dimension + LaunchEigenPadding(context, d_input, in_dims, d_out, out_dims, + paddings); + } + } + } + + template + void LaunchEigenPadding( + const framework::ExecutionContext& context, framework::Tensor* d_input, + const framework::DDim& in_dims, const framework::Tensor* d_out, + const framework::DDim& out_dims, + const Eigen::array, D>& paddings) const { + auto& place = + *context.template device_context().eigen_device(); auto d_in_t = framework::EigenTensor::From( - *d_input); + *d_input, in_dims); auto d_out_t = framework::EigenTensor::From( *d_out, out_dims); - d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + + if (d_input->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.pad: + // if element number less than INT_MAX, change the type of index to int + Eigen::array, D> paddings_32bit; + for (size_t i = 0; i < D; i++) { + paddings_32bit[i] = + std::make_pair(paddings[i].first, paddings[i].second); + } + framework::To32BitIndex(d_in_t).device(place) = + framework::To32BitIndex(d_out_t).pad(paddings_32bit, T(0)); + } else { + d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index e5e0dafdae0b1..9974536da9acb 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ 
b/paddle/fluid/operators/slice_op_npu.cc @@ -124,11 +124,13 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, ops::SliceNPUKernel); REGISTER_OP_NPU_KERNEL( slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index a8b1c7235448b..b98e620cc2d34 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -286,7 +286,7 @@ REGISTER_OP_CPU_KERNEL( ops::TileGradKernel, ops::TileGradKernel, ops::TileGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( tile, ops::TileKernel, ops::TileKernel, diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1d3fc14cdd16d..0827d6a5ae764 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -106,11 +106,11 @@ ELSE() ENDIF() IF(WITH_ASCEND_CL) -cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) ENDIF() IF(WITH_GPU) - nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) + nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) ENDIF() IF(WITH_ROCM) hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) @@ -136,7 +136,7 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) -cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) +cc_library(collective_helper SRCS collective_helper.cc collective_helper_npu.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) if(WITH_GPU OR WITH_ROCM) cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) @@ -190,6 +190,7 @@ cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) IF(WITH_GPU) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) + nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) ENDIF() diff --git a/paddle/fluid/platform/ascend_npu_info.h b/paddle/fluid/platform/ascend_npu_info.h index 7afed121a5acb..213013f5b1277 100644 --- a/paddle/fluid/platform/ascend_npu_info.h +++ b/paddle/fluid/platform/ascend_npu_info.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index 6cb4901f1dde3..a362e2903f245 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -21,6 +21,15 @@ #include #include +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#if defined(__CUDACC__) && CUDA_VERSION >= 11000 +#define PADDLE_CUDA_BF16 +#include +#endif + #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -44,6 +53,7 @@ struct PADDLE_ALIGN(2) bfloat16 { public: uint16_t x; + // Constructors bfloat16() = default; bfloat16(const bfloat16& o) = default; bfloat16& operator=(const bfloat16& o) = default; @@ -60,15 +70,34 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; +#else +#if defined(PADDLE_CUDA_BF16) + __nv_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); #endif +#endif + } + +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); } +#endif template HOSTDEVICE inline explicit bfloat16(const T& val) : x(bfloat16(static_cast(val)).x) {} +// Assignment operators +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); + return *this; + } +#endif + HOSTDEVICE inline bfloat16& operator=(bool b) { x = b ? 0x3f80 : 0; return *this; @@ -124,13 +153,24 @@ struct PADDLE_ALIGN(2) bfloat16 { return *this; } + // Conversion opertors HOSTDEVICE inline explicit operator float() const { +#ifdef PADDLE_CUDA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else float val = 0.f; uint16_t temp = x; memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); return val; +#endif + } + +#ifdef PADDLE_CUDA_BF16 + HOSTDEVICE inline explicit operator __nv_bfloat16() const { + return *reinterpret_cast(&x); } +#endif HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } @@ -223,6 +263,7 @@ HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { return res; } +// Comparison operators HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { return static_cast(a) == static_cast(b); } diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu new file mode 100644 index 0000000000000..dbbb72920a53b --- /dev/null +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/bfloat16.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#if defined(PADDLE_CUDA_BF16) +namespace paddle { +namespace platform { + +TEST(bfloat16, convert_float32_to_bfloat16_on_gpu) { + // Convert float32 to bfloat16 + EXPECT_EQ((bfloat16(1.0f)).x, 0x3f80); + EXPECT_EQ((bfloat16(0.5f)).x, 0x3f00); + EXPECT_EQ((bfloat16(0.33333f)).x, 0x3eab); + EXPECT_EQ((bfloat16(0.0f)).x, 0x0000); + EXPECT_EQ((bfloat16(-0.0f)).x, 0x8000); + EXPECT_EQ((bfloat16(65536.0f)).x, 0x4780); +} + +TEST(bfloat16, assignment_operator_on_gpu) { + // Assignment operator + bfloat16 v_assign; + v_assign = nv_bfloat16(bfloat16(1.0f)); + EXPECT_EQ(v_assign.x, 0x3f80); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3eab); +} + +TEST(bfloat16, convert_bfloat16_to_float32_on_gpu) { + // Conversion operator + EXPECT_EQ(static_cast(bfloat16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(bfloat16(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(bfloat16(-1)), -1); + EXPECT_EQ(static_cast(bfloat16(true)), true); +} + +TEST(bfloat16, lod_tensor_on_gpu) { + framework::LoDTensor src_tensor; + framework::LoDTensor gpu_tensor; + framework::LoDTensor dst_tensor; + + bfloat16 *src_ptr = src_tensor.mutable_data( + framework::make_ddim({2, 2}), CPUPlace()); + + bfloat16 arr[4] = {bfloat16(1.0f), bfloat16(0.5f), bfloat16(0.33333f), + bfloat16(0.0f)}; + memcpy(src_ptr, arr, 4 * sizeof(bfloat16)); + + // CPU LoDTensor to GPU LoDTensor + CUDAPlace gpu_place(0); + CUDADeviceContext gpu_ctx(gpu_place); + framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); + + // GPU LoDTensor to CPU LoDTensor + framework::TensorCopy(gpu_tensor, CPUPlace(), gpu_ctx, &dst_tensor); + + // Sync before comparing LoDTensors + gpu_ctx.Wait(); + const bfloat16 *dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 4; ++i) { + EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x); + } +} + +TEST(bfloat16, isinf) { + bfloat16 a; + a.x = 0x7f80; + bfloat16 b = bfloat16(INFINITY); + bfloat16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(bfloat16, isnan) { + bfloat16 a; + a.x = 0x7fff; + bfloat16 b = bfloat16(NAN); + bfloat16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + +TEST(bfloat16, cast) { + bfloat16 a; + a.x = 0x0070; + auto b = a; + { + // change semantic, keep the same value + bfloat16 c = reinterpret_cast(reinterpret_cast(b)); + EXPECT_EQ(b, c); + } + + { + // use uint32 low 16 bit store float16 + uint32_t c = reinterpret_cast(b); + bfloat16 d; + d.x = c; + EXPECT_EQ(b, d); + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 197f905ba68a2..b0b857f7ee3f2 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -22,6 +22,7 @@ #include "boost/variant.hpp" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/hccl.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -126,6 +127,113 @@ class NCCLCommContext { }; #endif +#if defined(PADDLE_WITH_ASCEND_CL) +// In order to apply hierarchical communication with HCCL, we need +// a communication ring contains 
HCCL communicators associated to a global +// HCCLUniqueId. E.g. for a hierarchical case, +// +// 11 - 12 21 - 22 +// | | | | +// 13 - 14 - 23 - 24 +// | | +// 31 - 32 - 41 - 42 +// | | | | +// 33 - 34 43 - 44 +// +// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24), +// (31,32,33,34), (41,42,43,44) as bottoms respectively. +// +// We could also use a single communication ring for the flatten case +// +// The HCCLComm instance is created and reversed in the HCCLCommContext +// singleton with a global user specified group id. +class NPUDeviceContext; + +#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE" +#define ENV_RANK_ID "PADDLE_TRAINER_ID" + +class HCCLComm { + public: + virtual int ring_id() const = 0; + virtual int nranks() const = 0; + virtual int rank() const = 0; + virtual int device_id() const = 0; + virtual HcclComm comm() const = 0; + virtual aclrtStream stream() const = 0; + virtual NPUDeviceContext* dev_context() const = 0; + virtual ~HCCLComm() = default; +}; + +// A singleton HCCL communicator context reserves communication ring ids +class HCCLCommContext { + public: + static HCCLCommContext& Instance() { + static HCCLCommContext comm_ctx; + return comm_ctx; + } + + HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, int rank, + int dev_id, int ring_id); + // a latter comm with the same dev_id and the same ring_id + // will override the former + HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank, int dev_id, + int ring_id); + + // retrieve a communicator by the ring id in multiprocessing mode + HCCLComm* Get(int ring_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator in ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1, + platform::errors::InvalidArgument( + "One device id should be specified to retrieve from " + "multiple communicators.")); + return comm_map_.at(ring_id).begin()->second.get(); + } + + // retrieve a communicator by the ring id and the device id + HCCLComm* Get(int ring_id, int dev_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator of ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_GT( + comm_map_.at(ring_id).count(dev_id), 0, + platform::errors::InvalidArgument( + "Communicator at device id %d has not been initialized in ring %d.", + dev_id, ring_id)); + return comm_map_.at(ring_id).at(dev_id).get(); + } + + // retrieve a communicator by the ring id and place + HCCLComm* Get(int ring_id, Place place) const { + return Get(ring_id, BOOST_GET_CONST(NPUPlace, place).device); + } + + private: + // Init global hcom + HCCLCommContext() {} + // we may use group feature in the feature + // HCCLCommContext() { InitHcomWorldGroup(); } + + HcclComm comm_; + + public: + ~HCCLCommContext() {} + + std::once_flag once_flag_; + std::mutex comm_map_mutex_; + // ring id to dev-HCCLComm + std::map>> comm_map_; + + // void InitHcomWorldGroup(); + void ReleaseHCCLComms(); + + DISABLE_COPY_AND_ASSIGN(HCCLCommContext); +}; +#endif + #if defined(PADDLE_WITH_XPU_BKCL) // In order to apply hierarchical communication with BKCL, we need // a communication ring contains BKCL communicators associated to a global diff --git a/paddle/fluid/platform/collective_helper_npu.cc b/paddle/fluid/platform/collective_helper_npu.cc new file mode 100644 index 0000000000000..f30e5fa833d44 --- /dev/null +++ b/paddle/fluid/platform/collective_helper_npu.cc @@ -0,0 +1,145 @@ 
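// Illustrative usage sketch of the HCCLCommContext declared above (values are placeholders;
// the root info is created on one rank and shared with the other ranks out of band):
//
//   HcclRootInfo hccl_id;
//   PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
//   platform::HCCLCommContext::Instance().CreateHCCLComm(
//       &hccl_id, /*nranks=*/2, /*rank=*/0, /*dev_id=*/0, /*ring_id=*/0);
//   auto* comm = platform::HCCLCommContext::Instance().Get(/*ring_id=*/0, /*dev_id=*/0);
//   aclrtStream stream = comm->stream();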
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include + +namespace paddle { +namespace platform { + +class HCCLCommImpl : public HCCLComm { + public: + void set_ring_id(int ring_id) { ring_id_ = ring_id; } + int ring_id() const override { return ring_id_; } + + void set_nranks(int nranks) { nranks_ = nranks; } + int nranks() const override { return nranks_; } + + void set_rank(int rank) { rank_ = rank; } + int rank() const override { return rank_; } + + int device_id() const override { + return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; + } + + ~HCCLCommImpl() { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_)); + } + + void set_comm(HcclComm comm) { comm_ = comm; } + HcclComm comm() const override { return comm_; } + + aclrtStream stream() const override { return dev_ctx_->stream(); } + + void set_dev_ctx(std::unique_ptr&& dev_ctx) { + dev_ctx_ = std::move(dev_ctx); + } + NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); } + + private: + int ring_id_; + int nranks_; + int rank_; + HcclComm comm_; + std::unique_ptr dev_ctx_; +}; + +HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, + int rank, int dev_id, int ring_id) { + PADDLE_ENFORCE_NOT_NULL(hccl_id, + platform::errors::InvalidArgument( + "The hccl unique id should not be null.")); + PADDLE_ENFORCE_GT( + nranks, 1, + platform::errors::InvalidArgument( + "Expected nranks > 1. But received nranks is %d.", nranks)); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::InvalidArgument( + "Expected rank >= 0. But received rank is %d.", rank)); + PADDLE_ENFORCE_LT( + rank, nranks, + platform::errors::InvalidArgument( + "Expected rank < nranks. But received rank is %d, nranks is %d.", + rank, nranks)); + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::InvalidArgument( + "Expected dev_id >= 0. 
But received dev_id is %d.", dev_id)); + + HcclComm comm; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id)); + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks + << ", hccl_id: " << hccl_id << ", rank: " << rank; + PADDLE_ENFORCE_NPU_SUCCESS( + platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm)); + + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks + << ", hccl_id: " << hccl_id << ", rank: " << rank; + + auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id); + + VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id + << " has been created on device " << dev_id + << ", with comm: " << comm_wrapper->comm(); + + std::call_once(once_flag_, []() { + std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); }); + }); + + return comm_wrapper; +} + +HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank, + int dev_id, int ring_id) { + std::unique_ptr dev_ctx( + new NPUDeviceContext(NPUPlace(dev_id))); + + HCCLCommImpl* c = new HCCLCommImpl; + c->set_ring_id(ring_id); + c->set_nranks(nranks); + c->set_rank(rank); + c->set_comm(comm); + c->set_dev_ctx(std::move(dev_ctx)); + + comm_map_mutex_.lock(); + if (comm_map_.count(ring_id) == 0) { + comm_map_.emplace(ring_id, std::map>()); + } + auto& dev2comm = comm_map_[ring_id]; + + dev2comm.emplace(dev_id, std::unique_ptr(c)); + comm_map_mutex_.unlock(); + + if (ring_id == 0) { + auto* dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::NPUPlace(dev_id))); + dev_ctx->set_hccl_comm(comm); + } + + return comm_map_[ring_id][dev_id].get(); +} + +void HCCLCommContext::ReleaseHCCLComms() { + for (auto& p : comm_map_) { + for (auto& q : p.second) { + q.second.reset(); + } + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3d72727c8da8e..f79cb1ab94788 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -189,14 +189,6 @@ class NPUDeviceContext : public DeviceContext { /*! \brief Return npu stream in the device context. */ aclrtStream stream() const; -#ifdef PADDLE_WITH_ASCEND_HCCL - /*! \brief Return bkcl context. */ - HCCLContext_t hccl_context() const { return hccl_context_; } - - /*! \brief Set bkcl context. */ - void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; } -#endif - template void AddStreamCallback(Callback&& callback) const { return stream_->AddCallback(callback); @@ -204,11 +196,28 @@ class NPUDeviceContext : public DeviceContext { void WaitStreamCallback() const { return stream_->WaitCallback(); } +#if defined(PADDLE_WITH_ASCEND_CL) + /*! \brief Return hccl communicators. */ + HcclComm hccl_comm() const { return hccl_comm_; } + + /*! \brief Set hccl communicators. 
*/ + void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; } +#endif + + // template + // void AddStreamCallback(Callback&& callback) const { + // return stream_->AddCallback(callback); + // } + + // void WaitStreamCallback() const { return stream_->WaitCallback(); } + private: NPUPlace place_; aclrtContext context_; -#ifdef PADDLE_WITH_ASCEND_HCCL - HCCLContext_t hccl_context_; + +#ifdef PADDLE_WITH_ASCEND_CL + // HCCLContext_t hccl_context_; + HcclComm hccl_comm_{nullptr}; #endif // Need to be the same with other DeviceContext, diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index e65a38cd323aa..b25fb5978d055 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -9,7 +9,7 @@ endif() # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux. if (NOT APPLE AND NOT WIN32) - list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) + list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) if (WITH_NCCL) list(APPEND CUDA_SRCS nccl.cc) endif() @@ -32,6 +32,8 @@ endif(CUPTI_FOUND) if(WITH_ROCM) hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +elseif (WITH_ASCEND_CL) + cc_library(dynload_warpctc SRCS warpctc.cc hccl.cc DEPS dynamic_loader warpctc) else() nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 956acfe2771c5..b49875f256bb2 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -36,6 +36,13 @@ DEFINE_string(nccl_dir, "", "For instance, /usr/local/cuda/lib64. If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); +DEFINE_string(hccl_dir, "", + "Specify path for loading hccl library, such as libhccl.so. " + "For instance, " + "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. 
If " + "default, " + "dlopen will search hccl from LD_LIBRARY_PATH"); + DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); DEFINE_string( @@ -392,6 +399,24 @@ void* GetNCCLDsoHandle() { warning_msg); #endif } +void* GetHCCLDsoHandle() { + std::string warning_msg( + "You may need to install 'hccl2' from Huawei official website: " + "before install PaddlePaddle."); +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, + warning_msg); +#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) + return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); + +#elif defined(PADDLE_WITH_ASCEND_CL) + return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {}, + warning_msg); +#else + return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, + warning_msg); +#endif +} void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index c3f5953c78579..8424160931690 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -34,6 +34,7 @@ void* GetNVRTCDsoHandle(); void* GetCUDADsoHandle(); void* GetWarpCTCDsoHandle(); void* GetNCCLDsoHandle(); +void* GetHCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); diff --git a/paddle/fluid/platform/dynload/hccl.cc b/paddle/fluid/platform/dynload/hccl.cc new file mode 100644 index 0000000000000..5efac7691eb98 --- /dev/null +++ b/paddle/fluid/platform/dynload/hccl.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL + +#include "paddle/fluid/platform/dynload/hccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hccl_dso_flag; +void *hccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +#if HCCL_VERSION_CODE >= 2212 +HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) +#endif + +#if HCCL_VERSION_CODE >= 2703 +HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/dynload/hccl.h b/paddle/fluid/platform/dynload/hccl.h new file mode 100644 index 0000000000000..a56180ce2d4ca --- /dev/null +++ b/paddle/fluid/platform/dynload/hccl.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL + +#include +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +#define HCOM_GROUP_PREFIX "HCOM_GROUP_" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag hccl_dso_flag; +extern void* hccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using HCCL_func = decltype(&::__name); \ + std::call_once(hccl_dso_flag, []() { \ + hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(HcclReduceScatter); \ + __macro(HcclCommDestroy); \ + __macro(HcclAllReduce); \ + __macro(HcclCommInitRootInfo); \ + __macro(HcclGetRootInfo); \ + __macro(HcclBroadcast); \ + __macro(HcclCommInitClusterInfo); \ + __macro(HcclAllGather); + +HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) + +#if HCCL_VERSION_CODE >= 2212 +#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast); +HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) +#endif + +#if HCCL_VERSION_CODE >= 2703 +#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ + __macro(HCCLSend); \ + __macro(HCCLRecv); +HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index e72fbd246cf05..1d105a1fd8682 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -27,7 +27,8 @@ void* tensorrt_plugin_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name -TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP); +TENSORRT_RAND_ROUTINE_EACH_POINTER(DEFINE_WRAP); +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DEFINE_WRAP); TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP); void* GetDsoHandle(const std::string& dso_name) { diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index e9bea9af9ca6e..bc29a0472041a 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -37,7 +37,7 @@ void* GetTensorRtPluginHandle(); extern std::once_flag tensorrt_plugin_dso_flag; extern void* tensorrt_plugin_dso_handle; -#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \ struct DynLoad__##__name { \ template \ void* operator()(Args... args) { \ @@ -55,6 +55,23 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL(p_##__name, \ + platform::errors::Unavailable( \ + "Load tensorrt api %s failed", #__name)); \ + using tensorrt_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + #define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ struct DynLoad__##__name { \ template \ @@ -76,20 +93,25 @@ extern void* tensorrt_plugin_dso_handle; #ifdef NV_TENSORRT_MAJOR #if (NV_TENSORRT_MAJOR >= 6) -#define TENSORRT_RAND_ROUTINE_EACH(__macro) \ - __macro(createInferBuilder_INTERNAL); \ - __macro(createInferRuntime_INTERNAL); \ +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ + __macro(createInferRuntime_INTERNAL); \ __macro(getPluginRegistry); #else -#define TENSORRT_RAND_ROUTINE_EACH(__macro) \ - __macro(createInferBuilder_INTERNAL); \ +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ __macro(createInferRuntime_INTERNAL); #endif +#define TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(__macro) \ + __macro(getInferLibVersion); + #define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ __macro(initLibNvInferPlugins); -TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP) +TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP) +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER( + DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP) TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) #endif // end of NV_TENSORRT_MAJOR diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f0809d34d493e..cfca3ceadf41a 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -47,6 +47,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "acl/acl.h" +#include "hccl/hccl_types.h" #endif // PADDLE_WITH_ASCEND_CL #include @@ -1220,6 +1221,7 @@ struct NPUStatusType {}; } DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); +DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS); } // namespace details inline std::string build_npu_error_msg(aclError stat) { @@ -1228,6 +1230,12 @@ inline std::string build_npu_error_msg(aclError stat) { return sout.str(); } +inline std::string build_npu_error_msg(HcclResult stat) { + std::ostringstream sout; + sout << " HCCL error, the error code is : " << stat << ". "; + return sout.str(); +} + #define PADDLE_ENFORCE_NPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ diff --git a/paddle/fluid/platform/hccl_helper.h b/paddle/fluid/platform/hccl_helper.h new file mode 100644 index 0000000000000..692f8dbe0bf1e --- /dev/null +++ b/paddle/fluid/platform/hccl_helper.h @@ -0,0 +1,355 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
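// hccl_helper.h mirrors nccl_helper.h for Ascend: it provides the VarType -> HcclDataType
// mapping, the per-device HCCLContext/HCCLContextMap wrappers, and the HCCLCommunicator that
// manages flat and hierarchical (2D) all-reduce rings.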
+ +#pragma once + +#if defined(PADDLE_WITH_HCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) + +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/dynload/hccl.h" +#endif + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +#define HCCL_ID_VARNAME "HCCLID" + +namespace paddle { +namespace platform { + +inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP32) { + return HCCL_DATA_TYPE_FP32; + } else if (type == framework::proto::VarType::FP16) { + return HCCL_DATA_TYPE_FP16; + } else if (type == framework::proto::VarType::INT32) { + return HCCL_DATA_TYPE_INT32; + } else if (type == framework::proto::VarType::INT8) { + return HCCL_DATA_TYPE_INT8; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in hccl is not supported.")); + } +} + +// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which will +// cause blocking problem when a runtime_error was thrown, so try only guard +// HCCL actions when use it. + +// class HCCLGroupGuard { +// public: +// static std::mutex &HCCLMutex() { +// static std::mutex mtx; +// return mtx; +// } + +// inline HCCLGroupGuard() { +// HCCLMutex().lock(); +// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); +// } + +// inline ~HCCLGroupGuard() PADDLE_MAY_THROW { +// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); +// HCCLMutex().unlock(); +// } +// }; + +struct HCCLContext { + std::unique_ptr ctx_; + HcclComm comm_; + + explicit HCCLContext(int dev_id) + : ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {} + + aclrtStream stream() const { return ctx_->stream(); } + HcclComm comm() const { return comm_; } + + int device_id() const { + return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device; + } +}; + +struct HCCLContextMap { + std::unordered_map contexts_; + std::vector order_; + + explicit HCCLContextMap(const std::vector &places, + HcclRootInfo *hccl_id = nullptr, + size_t num_trainers = 1, size_t trainer_id = 0) { + PADDLE_ENFORCE_EQ(!places.empty(), true, + platform::errors::InvalidArgument( + "The HCCL place should not be empty.")); + order_.reserve(places.size()); + for (auto &p : places) { + int dev_id = BOOST_GET_CONST(NPUPlace, p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, HCCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + platform::errors::Unavailable("HCCL Context Map does not support " + "contain two or more same device.")); + + std::unique_ptr comms(new HcclComm[order_.size()]); + // if num_trainers == 1, should create a new nccl id for local comms. 
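// (HCCL has no ncclCommInitAll-style single-process initializer wired up here, so the
// single-trainer branch below is intentionally left empty for now.)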
+ if (num_trainers == 1 && hccl_id == nullptr) { + // we do not know how to tackle this situation under hccl + // std::lock_guard guard(HCCLGroupGuard::HCCLMutex()); + // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll( + // comms.get(), static_cast(order_.size()), order_.data())); + } else { + PADDLE_ENFORCE_NOT_NULL(hccl_id, platform::errors::InvalidArgument( + "The HCCL id should not be null.")); + { + int nranks = num_trainers * order_.size(); + // HCCLGroupGuard gurad; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks + << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; + aclrtSetDevice(gpu_id); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo( + nranks, hccl_id, rank, comms.get() + i)); + } + } + } + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + } + + HCCLContextMap(const HCCLContextMap &other) = delete; + HCCLContextMap &operator=(const HCCLContextMap &other) = delete; + + NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + + NPUDeviceContext *DevCtx(platform::Place p) const { + return DevCtx(BOOST_GET_CONST(NPUPlace, p).device); + } + + const HCCLContext &at(platform::Place p) const { + return this->at(BOOST_GET_CONST(NPUPlace, p).device); + } + + const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; + +inline std::string GetFlatHCCLVarName(size_t pos) { + if (pos == 0) { + return HCCL_ID_VARNAME; + } + return string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast(pos)); +} + +inline std::string GetHierarchicalExterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} +inline std::string GetHierarchicalInterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} + +class HCCLCommunicator { + public: + HCCLCommunicator() {} + virtual ~HCCLCommunicator() PADDLE_MAY_THROW {} + + HCCLContextMap *DefaultFlatCtx() const { + if (flat_ctxs_.size() == 0) { + return nullptr; + } + + return flat_ctxs_[0].get(); + } + + std::vector> *GetFlatCtxs() { + return &flat_ctxs_; + } + + HCCLContextMap *GetFlatCtx(size_t run_order) const { + return flat_ctxs_[run_order % flat_ctxs_.size()].get(); + } + + HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order, + bool use_hierarchical_allreduce) const { + if (!use_hierarchical_allreduce) { + return GetFlatCtx(run_order); + } + + return GetHierarchicalInterCtx(run_order); + } + + /* + When nccl inits nccl comm using ncclCommInitAll, it meets error when + allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So + create a new nccl comm for sync_batch_norm_op. And these codes should be + polished with a unified nccl management. 
+ */ + + HCCLContextMap *GetSyncBatchNormCtx( + framework::Scope *scope, const std::vector &places) { + auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME); + if (hccl_id_var != nullptr) { + return DefaultFlatCtx(); + } + + if (sync_batch_norm_ctx_.get() == nullptr) { + sync_batch_norm_ctx_.reset(new HCCLContextMap(places)); + } + return sync_batch_norm_ctx_.get(); + } + + void InitFlatCtxs(const std::vector &places, + const std::vector &hccl_ids, + size_t trainers_num, size_t trainer_id) { + if (hccl_ids.size() == 0) { + auto ptr = new platform::HCCLContextMap(places); + VLOG(1) << "init local trainer"; + flat_ctxs_.emplace_back(ptr); + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto ptr = new platform::HCCLContextMap(places, hccl_ids[i], + trainers_num, trainer_id); + VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; + flat_ctxs_.emplace_back(ptr); + } + } + + // as Executor have no way to use ncclComm created by ParallelExecutor, + // we assign all flatten contexts to HCCLCommContext to fix. + int nranks = static_cast(trainers_num * places.size()); + int nrings = static_cast(flat_ctxs_.size()); + for (int ring_id = 0; ring_id < nrings; ++ring_id) { + for (size_t p = 0; p < places.size(); ++p) { + int rank = trainer_id * places.size() + p; + int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device; + auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); + HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank, + dev_id, ring_id); + } + } + } + + void InitHierarchicalCtxs(const std::vector &places, + const std::vector &inter_hccl_ids, + const std::vector &exter_hccl_ids, + size_t trainers_num, size_t trainer_id, + size_t inter_trainers_num, + size_t exter_trainers_num) { + PADDLE_ENFORCE_EQ( + trainers_num, inter_trainers_num * exter_trainers_num, + platform::errors::InvalidArgument( + "trainers_num:%llu != inter_trainers_num:%llu * " + "exter_trainers_num:%llu", + trainers_num, inter_trainers_num, exter_trainers_num)); + + PADDLE_ENFORCE_GT( + inter_trainers_num, 1, + platform::errors::InvalidArgument( + "The inter_trainers_num:%llu should be larger than 1.", + inter_trainers_num)); + + int inter_trainer_id = trainer_id % inter_trainers_num; + for (size_t i = 0; i < inter_hccl_ids.size(); i++) { + VLOG(1) << "init inter_trainer_id:" << inter_trainer_id + << ", comm no:" << i; + auto local = new HCCLContextMap(places, inter_hccl_ids[i], + inter_trainers_num, inter_trainer_id); + + h_inter_ctxs_.emplace_back(local); + } + + int exter_trainer_id = -1; + if (trainer_id % inter_trainers_num == 0) { + exter_trainer_id = trainer_id / inter_trainers_num; + } + + if (exter_trainer_id >= 0) { + for (size_t i = 0; i < exter_hccl_ids.size(); i++) { + auto ex = new HCCLContextMap(places, exter_hccl_ids[i], + exter_trainers_num, exter_trainer_id); + VLOG(1) << "init exter_trainer_id:" << exter_trainer_id + << ", comm no:" << i; + h_exter_ctxs_.emplace_back(ex); + } + } + } + + bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } + + HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); + } + + HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_exter_ctxs_[run_order % 
h_exter_ctxs_.size()].get(); + } + + std::vector> *GetHierarchicalInterCtxs() { + return &h_inter_ctxs_; + } + + std::vector> *GetHierarchicalExterCtxs() { + return &h_exter_ctxs_; + } + + protected: + // Support multi nccl comm on default nccl ring while HCCLContextMap can't. + std::vector> flat_ctxs_; + + // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. + // And h_exter_ctxs_ can support multi comm too. + std::vector> h_inter_ctxs_; + std::vector> h_exter_ctxs_; + + // just used for sync_batch_norm op. + std::unique_ptr sync_batch_norm_ctx_; +}; + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b43ad592a3a25..b30214e1d8355 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -61,7 +61,7 @@ set(PYBIND_SRCS if(WITH_ASCEND) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) -endif(WITH_ASCEND) +endif() if(WITH_GLOO) set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context) @@ -86,7 +86,11 @@ endif() if(WITH_PYTHON) # generate op pybind functions automatically for dygraph. - set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) + if (WITH_ASCEND_CL) + set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag ascend_wrapper) + else() + set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) + endif() list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) @@ -100,6 +104,7 @@ if(WITH_PYTHON) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(op_function_generator ${os_dependency_modules}) if(WITH_ROCM) @@ -153,9 +158,9 @@ if(WITH_PYTHON) ) endif() else(WIN32) - # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, + # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to - # LD_LIBRARY_PATH. This is different with Windows platformm, which search + # LD_LIBRARY_PATH. This is different with Windows platformm, which search # *.dll in current directory automatically. add_custom_command(TARGET op_function_generator POST_BUILD diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index 303ab5c0fe8ca..9a1fa1d7704c2 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #ifdef _POSIX_C_SOURCE diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h index e999080544c31..15fb056c90e02 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.h +++ b/paddle/fluid/pybind/ascend_wrapper_py.h @@ -14,7 +14,7 @@ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0817dc3367162..93441eb52fe5e 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -718,7 +718,8 @@ void BindImperative(py::module *m_ptr) { { // Release gil and do tracing py::gil_scoped_release release; - tracer->TraceOp("set_value", ins, outs, std::move(attrs)); + tracer->TraceOp("set_value", ins, outs, std::move(attrs), + {{"Input", "Out"}}); } } else { auto self_numpy = TensorToPyArray(*self_tensor); @@ -745,7 +746,7 @@ void BindImperative(py::module *m_ptr) { // inplace operator for the VarBase self. self->BumpInplaceVersion(); }) - .def("__getitem__", + .def("_getitem_index_not_tensor", [](std::shared_ptr &self, py::handle _index) { std::vector slice_axes, slice_starts, slice_ends, slice_strides, decrease_axis, infer_flags; @@ -1487,7 +1488,7 @@ void BindImperative(py::module *m_ptr) { allow_ops); imperative::AmpOperators::Instance().GetMutableBlockOps()->swap( block_ops); - VLOG(4) << "AMP operators changed, " + VLOG(5) << "AMP operators changed, " << imperative::AmpOperators::Instance(); }) .def("_get_amp_op_list", diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index dd9cb65142a3d..8a5ad5852aedf 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -467,7 +467,10 @@ void BindAnalysisConfig(py::module *m) { .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) .def("enable_xpu", &AnalysisConfig::EnableXpu, - py::arg("l3_workspace_size")) + py::arg("l3_workspace_size") = 16 * 1024 * 1024, + py::arg("locked") = false, py::arg("autotune") = true, + py::arg("autotune_file") = "", py::arg("precision") = "int16", + py::arg("adaptive_seqlen") = false) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("use_gpu", &AnalysisConfig::use_gpu) .def("use_xpu", &AnalysisConfig::use_xpu) @@ -512,6 +515,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("dla_core") = 0) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) + .def("enable_dlnne", &AnalysisConfig::EnableDlnne, + py::arg("min_subgraph_size") = 3) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("zero_copy") = false, diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 2c1927f49f6b7..237cec13a8025 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif @@ -127,10 +127,6 @@ std::map> op_passing_outs_map = { {"c_broadcast", {"Out"}}, {"c_sync_calc_stream", 
{"Out"}}, {"c_sync_comm_stream", {"Out"}}, - {"c_allreduce_sum", {"Out"}}, - {"c_allreduce_max", {"Out"}}, - {"c_allreduce_min", {"Out"}}, - {"c_allreduce_prod", {"Out"}}, {"c_reduce_sum", {"Out"}}, {"c_reduce_max", {"Out"}}, {"c_reduce_min", {"Out"}}, @@ -182,16 +178,16 @@ const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableO const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})"; -const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( - if (%s != nullptr) { - ins["%s"] = {%s}; - } +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( + if (%s != nullptr) { + ins["%s"] = {%s}; + } )"; -const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( if (%s.size() != 0) { - ins["%s"] = %s; - } + ins["%s"] = %s; + } )"; const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( @@ -264,8 +260,8 @@ R"( imperative::NameVarBaseMap ins = %s; %s tracer->TraceOp("%s", ins, outs, attrs, {%s}); - return %s; - } + return %s; + } })"; const char* PYBIND_ITEM_TEMPLATE = R"( %s.def("%s", &%s);)"; @@ -350,7 +346,7 @@ std::string GenerateOpFunctionsBody( } ins_initializer += "}"; - if (input_args.back() == ',') { + if (!input_args.empty() && input_args.back() == ',') { input_args.pop_back(); } @@ -364,6 +360,7 @@ std::string GenerateOpFunctionsBody( int outs_num = 0; for (auto& output : op_proto->outputs()) { auto& out_name = output.name(); + // skip those dispensable oututs if (output.dispensable() && !FindOutsMap(op_type, out_name)) { continue; @@ -459,7 +456,7 @@ std::string GenerateOpFunctionsBody( return_str.pop_back(); } outs_initializer += "}"; - if (inplace_mapping_str.back() == ',') { + if (!inplace_mapping_str.empty() && inplace_mapping_str.back() == ',') { inplace_mapping_str.pop_back(); } if (!use_inplace_strategy && FindViewOpMap(op_type)) { @@ -567,7 +564,7 @@ int main(int argc, char* argv[]) { return -1; } -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); ascend_ptr->InitGEForUT(); #endif @@ -602,8 +599,9 @@ int main(int argc, char* argv[]) { out.close(); -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL ge::GEFinalize(); #endif + return 0; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f315323ed592e..560d8c892b09f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -268,11 +268,6 @@ bool IsCompiledWithBrpc() { #ifndef PADDLE_WITH_DISTRIBUTE return false; #endif - -#ifdef PADDLE_WITH_GRPC - return false; -#endif - return true; } @@ -501,7 +496,56 @@ PYBIND11_MODULE(core_noavx, m) { #endif return tensor; }); - + m.def("_save_lod_tensor", [](const LoDTensor &tensor, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", str_file_name)); + SerializeToStream(fout, tensor); + + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + m.def("_load_lod_tensor", [](LoDTensor &tensor, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load variables.", str_file_name)); + + DeserializeFromStream(fin, &tensor); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); + m.def("_save_selected_rows", 
[](const SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable("Cannot open %s to save SelectedRows.", + str_file_name)); + + SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + m.def("_load_selected_rows", + [](SelectedRows &selected_rows, const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load SelectedRows.", str_file_name)); + + DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); m.def("_save_static_dict", [](const std::string &str_file_name, const py::handle &vec_var_list, const Scope &scope) { diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 20c8794ba634c..787f5297e7405 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -26,10 +26,30 @@ set cache_dir=%work_dir:Paddle=cache% if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) -taskkill /f /im op_function_generator.exe -wmic process where name="op_function_generator.exe" call terminate +taskkill /f /im op_function_generator.exe 2>NUL +taskkill /f /im cmake.exe 2>NUL +taskkill /f /im MSBuild.exe 2>NUL +taskkill /f /im CL.exe 2>NUL +taskkill /f /im Lib.exe 2>NUL +taskkill /f /im link.exe 2>NUL +taskkill /f /im vctip.exe 2>NUL +taskkill /f /im cvtres.exe 2>NUL +taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL taskkill /f /im python.exe 2>NUL - +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" @@ -54,6 +74,7 @@ if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_ if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined PRECISION_TEST set PRECISION_TEST=OFF if not defined NIGHTLY_MODE set PRECISION_TEST=OFF +if not defined retry_times set retry_times=2 rem -------set cache build directory----------- rmdir build\python /s/q @@ -162,11 +183,10 @@ rem ------show summary of current environment---------- cmake --version if "%WITH_GPU%"=="ON" ( nvcc --version - where nvidia-smi nvidia-smi ) -python %work_dir%\tools\summary_env.py -%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh +::python %work_dir%\tools\summary_env.py +::%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh goto :CASE_%1 @@ -195,6 +215,7 @@ rem ------PR CI windows check for OPENBLAS/CPU------ set WITH_MKL=ON set WITH_GPU=OFF set MSVC_STATIC_CRT=ON +set retry_times=1 call :cmake || goto cmake_error call :build || goto build_error @@ -209,6 +230,7 @@ rem ------Build windows avx whl 
package------ set WITH_AVX=ON set ON_INFER=OFF set CUDA_ARCH_NAME=All +set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error @@ -220,6 +242,7 @@ rem ------Build windows no-avx whl package------ set WITH_AVX=OFF set ON_INFER=OFF set CUDA_ARCH_NAME=All +set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error @@ -249,6 +272,8 @@ echo ======================================== rem Configure the environment for 64-bit builds. 'DISTUTILS_USE_SDK' indicates that the user has selected the compiler. call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" set DISTUTILS_USE_SDK=1 +rem Windows 10 Kit bin dir +set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -342,7 +367,7 @@ if %GENERATOR% == "Ninja" ( ) if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 2 ( + if %build_times% GTR %retry_times% ( exit /b 7 ) else ( echo Build third_party failed, will retry! @@ -356,6 +381,28 @@ set build_times=1 :: reset clcache zero stats for collect PR's actual hit rate rem clcache.exe -z +rem -------clean up environment again----------- +taskkill /f /im MSBuild.exe 2>NUL +taskkill /f /im cl.exe 2>NUL +taskkill /f /im lib.exe 2>NUL +taskkill /f /im link.exe 2>NUL +taskkill /f /im vctip.exe 2>NUL +taskkill /f /im cvtres.exe 2>NUL +taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL + echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( ninja -j %PARALLEL_PROJECT_COUNT% @@ -369,7 +416,7 @@ if %GENERATOR% == "Ninja" ( if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 1 ( + if %build_times% GTR %retry_times% ( exit /b 7 ) else ( echo Build Paddle failed, will retry! @@ -706,9 +753,21 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL +taskkill /f /im python.exe 2>NUL +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL -taskkill /f /im python.exe 2>NUL echo Windows CI run successfully! 
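rem NOTE: retry_times is set per CI case above (1 for the OPENBLAS/CPU PR check,
rem 4 for the avx and no-avx whl builds, default 2) and now caps both the
rem third_party and the Paddle build retry loops, which previously retried a
rem fixed 2 and 1 times respectively.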
exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 88056a6eb9380..b8b9f40aa33fc 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -145,6 +145,18 @@ function cmake_base() { else exit 1 fi + elif [ "$1" == "cp39-cp39" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.9/include/python3.9/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/libpython3.9.dylib" + pip3.9 install --user -r ${PADDLE_ROOT}/python/requirements.txt + else + exit 1 + fi fi else if [ "$1" != "" ]; then @@ -205,6 +217,13 @@ function cmake_base() { -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so" pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt + elif [ "$1" == "cp39-cp39" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.9.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.9.0/bin/python3.9 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.9.0/include/python3.9 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.9.0/lib/libpython3.so" + pip3.9 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "conda-python3.7" ]; then export LD_LIBRARY_PATH=/opt/conda/lib/:${LD_LIBRARY_PATH} export PATH=/opt/conda/bin/:${PATH} @@ -227,7 +246,6 @@ function cmake_base() { fi distibuted_flag=${WITH_DISTRIBUTE:-OFF} - grpc_flag="OFF" gloo_flag=${distibuted_flag} cat <${spec_path}.doc awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" || [ "$1" == "cp39-cp39" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' $spec_path sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path @@ -1535,12 +1555,14 @@ EOF ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == 
"ON" ]]; then ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl @@ -1548,11 +1570,13 @@ EOF ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl + ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl + ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl fi #ref_paddle2_mv1="" @@ -1673,6 +1697,22 @@ EOF apt-get clean -y && \ rm -f ${ref_paddle38} && \ ldconfig +EOF + cat >> ${PADDLE_ROOT}/build/Dockerfile < /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.9.0.tgz + RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ + wget ${ref_web}/${ref_paddle39} && pip3.9 install ${ref_paddle39_whl}; apt-get install -f -y && \ + apt-get clean -y && \ + rm -f ${ref_paddle39} && \ + ldconfig EOF cat >> ${PADDLE_ROOT}/build/Dockerfile < envs; std::vector undefok; -#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC) && \ - !defined(PADDLE_WITH_PSLIB) +#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_PSLIB) std::string str_max_body_size; if (::GFLAGS_NAMESPACE::GetCommandLineOption("max_body_size", &str_max_body_size)) { diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 861839256a3da..94091c94bb533 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -45,6 +45,7 @@ from paddle.framework.dtype import complex64 from paddle.framework.dtype import complex128 from .framework import VarBase as Tensor +Tensor.__qualname__ = 'Tensor' import paddle.compat import paddle.distributed import paddle.sysconfig @@ -149,6 +150,7 @@ from .tensor.manipulation import unbind #DEFINE_ALIAS from .tensor.manipulation import roll #DEFINE_ALIAS from .tensor.manipulation import chunk #DEFINE_ALIAS +from .tensor.manipulation import tolist #DEFINE_ALIAS from .tensor.math import abs #DEFINE_ALIAS from .tensor.math import acos #DEFINE_ALIAS from .tensor.math import asin #DEFINE_ALIAS @@ -256,6 +258,7 @@ from .framework import grad #DEFINE_ALIAS from .framework import no_grad #DEFINE_ALIAS +from .framework import set_grad_enabled #DEFINE_ALIAS from .framework import save #DEFINE_ALIAS from .framework import load #DEFINE_ALIAS from .framework import DataParallel #DEFINE_ALIAS @@ -297,6 +300,8 @@ from .hapi import callbacks from .hapi import summary from .hapi import flops +from .hapi import hub + import paddle.text import paddle.vision diff --git a/python/paddle/amp/grad_scaler.py 
b/python/paddle/amp/grad_scaler.py index 64b34ce834563..72a67a92c4958 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -62,6 +62,7 @@ class GradScaler(AmpScaler): scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward scaler.minimize(optimizer, scaled) # update parameters + optimizer.clear_grad() """ def __init__(self, @@ -105,6 +106,7 @@ def scale(self, var): scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward scaler.minimize(optimizer, scaled) # update parameters + optimizer.clear_grad() """ return super(GradScaler, self).scale(var) @@ -140,5 +142,6 @@ def minimize(self, optimizer, *args, **kwargs): scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward scaler.minimize(optimizer, scaled) # update parameters + optimizer.clear_grad() """ return super(GradScaler, self).minimize(optimizer, *args, **kwargs) diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index c093565dc92ff..35e2cd2439177 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -176,8 +176,9 @@ def backward(ctx, dy): class PyLayerBackward(PyLayerContext): def backward(self, *args, **kwargs): - with paddle.fluid.dygraph.no_grad(): - return self._forward_cls.backward(*args, **kwargs) + with paddle.fluid.dygraph.guard(): + with paddle.fluid.dygraph.no_grad(): + return self._forward_cls.backward(*args, **kwargs) class LayerMeta(type): diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 2756dea72e84a..c0feadb68838d 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -397,23 +397,22 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): return ring_id = 0 if group is None else group.id - if in_dygraph_mode(): if op == ReduceOp.SUM: - return core.ops.c_allreduce_sum(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return core.ops.c_allreduce_sum_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id) elif op == ReduceOp.MAX: - return core.ops.c_allreduce_max(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return core.ops.c_allreduce_max_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id) elif op == ReduceOp.MIN: - return core.ops.c_allreduce_min(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id) + return core.ops.c_allreduce_min_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id) elif op == ReduceOp.PROD: - return core.ops.c_allreduce_prod(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) + return core.ops.c_allreduce_prod_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id) else: raise ValueError("Unknown parameter: {}.".format(op)) + return out check_variable_and_dtype( tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], @@ -692,6 +691,153 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): }) +def _c_identity(tensor, group=None): + """ + Return a copy of the tensor, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. Its data type + should be float16, float32, float64, int32 or int64. + group (int): The id of the process group to work on. + + Returns: + Tensor. 
+ """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if in_dygraph_mode(): + return core.ops.c_identity(tensor, 'use_calc_stream', True, 'ring_id', + ring_id, 'use_model_parallel', True) + op_type = 'c_identity' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_identity') + + helper.append_op( + type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + return out + + +def _c_concat(tensor, nranks, group=None): + """ + Return allgather of the tensor, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. Its data type + should be float16, float32, float64, int32 or int64. + group (int): The id of the process group to work on. + + Returns: + Tensor. + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if in_dygraph_mode(): + return core.ops.c_concat(tensor, 'ring_id', ring_id, 'use_calc_stream', + True, 'nranks', nranks, 'use_model_parallel', + True) + + op_type = 'c_concat' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_concat') + + helper.append_op( + type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True, + 'nranks': nranks + }) + return out + + +def _c_split(tensor, rank, nranks, group=None): + """ + Split tensor evenly among all members, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. Its data type + should be float16, float32, float64, int32 or int64. + rank (int): The rank of the current process. + group (int): The id of the process group to work on. + + Returns: + Tensor. + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if in_dygraph_mode(): + return core.ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', + ring_id, 'rank', rank, 'nranks', nranks, + 'use_model_parallel', True) + + op_type = 'c_split' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_split') + + helper.append_op( + type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'rank': rank, + 'nranks': nranks, + 'use_model_parallel': True, + }) + return out + + +def _mp_allreduce(tensor, + op=ReduceOp.SUM, + group=None, + use_calc_stream=True, + use_model_parallel=True): + """[it is same as allreduce above, but it suuports model parallel. 
And it support inplace startegy] + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if in_dygraph_mode(): + if op == ReduceOp.SUM: + return core.ops.c_allreduce_sum_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, + "use_model_parallel", use_model_parallel) + else: + raise ValueError("Unknown parameter: {}.".format(op)) + else: + raise NotImplementedError("No support _mp_allreduce in dygraph mode.") + + def barrier(group=None): """ @@ -732,15 +878,31 @@ def barrier(group=None): attrs={'ring_id': ring_id}) -def _parallel_linear(x, num_rows, num_cols, axis, param_attr, bias_attr, - gather_out, inner_rank, name): +def _parallel_linear(x, + num_rows, + num_cols, + axis, + param_attr, + bias_attr, + gather_out, + inner_rank, + nranks, + split_tensor, + name, + group=None): """ Parallel Linear """ - if not name: - name = "fc_by_row_rank_%d" % inner_rank if axis == 0 else "fc_by_col_rank_%d" % inner_rank + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if axis == 0: + if split_tensor: + x = _c_split(x, inner_rank, nranks, group=group) else: - name = name + "_by_row_rank_%d" % inner_rank if axis == 0 else name + "_by_col_rank_%d" % inner_rank + x = _c_identity(x, group=group) + linear = paddle.nn.Linear( num_rows, num_cols, @@ -748,33 +910,63 @@ def _parallel_linear(x, num_rows, num_cols, axis, param_attr, bias_attr, bias_attr=bias_attr, name=name) - weight = linear.weight - weight.is_distributed = True linear_out = linear(x) startup_block = paddle.static.default_startup_program().global_block() main_block = paddle.static.default_main_program().global_block() - startup_block.vars[weight.name].is_distributed = True - main_block.vars[weight.name].is_distributed = True - - if gather_out: - if axis == 0: - paddle.distributed.all_reduce(linear_out) - else: - output = [] - paddle.distributed.all_gather(output, linear_out) - linear_out = paddle.concat(output, axis=len(linear_out.shape) - 1) - return linear_out + startup_block.vars[linear.weight.name].is_distributed = True + main_block.vars[linear.weight.name].is_distributed = True + + if not gather_out: return linear_out + + op_type = 'c_allreduce_sum' if axis == 0 else 'c_concat' + out_shape = list(linear_out.shape) + out_shape[0] *= 1 if axis == 0 else nranks + out = main_block.create_var( + shape=out_shape, + dtype=linear_out.dtype, + type=linear_out.type, + lod_level=linear_out.lod_level, + persistable=False, + is_data=False, + need_check_feed=linear_out.desc.need_check_feed()) + if axis == 0: + main_block.append_op( + type='c_allreduce_sum', + inputs={'X': linear_out}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + else: + main_block.append_op( + type='c_concat', + inputs={'X': linear_out}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'nranks': nranks, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + return out -def _parallel_embedding(x, per_part_embeddings, origin_size, param_attr, - inner_rank, num_partitions, name): +def _parallel_embedding(x, + per_part_embeddings, + origin_size, + param_attr, + inner_rank, + num_partitions, + name, + group=None): """ Parallel Embedding """ - if not name: - name = "emb_rank_%d" % inner_rank - else: - name = name + "_rank_%d" % inner_rank + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id origin_num_embeddings = 
origin_size[0] embedding = paddle.nn.Embedding( @@ -795,15 +987,29 @@ def _parallel_embedding(x, per_part_embeddings, origin_size, param_attr, inner_rank, per_part_embeddings - 1) if len(origin_input_shape) == 2: x_shard = paddle.squeeze(x_shard, axis=-1) - - embedding.weight.is_distributed = True emb_out = embedding(x_shard) startup_block = paddle.static.default_startup_program().global_block() main_block = paddle.static.default_main_program().global_block() startup_block.vars[embedding.weight.name].is_distributed = True main_block.vars[embedding.weight.name].is_distributed = True - paddle.distributed.all_reduce(emb_out, group=None) - return emb_out + out = main_block.create_var( + shape=emb_out.shape, + dtype=emb_out.dtype, + type=emb_out.type, + lod_level=emb_out.lod_level, + persistable=False, + is_data=False, + need_check_feed=emb_out.desc.need_check_feed()) + main_block.append_op( + type='c_allreduce_sum', + inputs={'X': emb_out}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + return out def split(x, @@ -896,8 +1102,10 @@ def split(x, "paddle.distributed.split must be one of {}.".format( supported_operations)) if in_dygraph_mode(): - rank = paddle.distributed.get_rank() - nranks = paddle.distributed.get_world_size() + raise ValueError( + "paddle.distributed.split cannot be used in dynamic " + "graph mode, plese use ParallelEmbedding, ParallelRowLinear, " + "ParallelColumnLinear instead.") else: assert fleet._role_maker, ("To use paddle.distributed.split, " "you must call fleet.init() firstly.") @@ -915,10 +1123,18 @@ def split(x, if inner_rank == num_partitions - 1: per_part_size = last_part_size per_part_size += 1 # make the last row as the padding index - emb_out = _parallel_embedding(x, per_part_size, size, weight_attr, - inner_rank, num_partitions, name) + emb_out = _parallel_embedding( + x, + per_part_size, + size, + weight_attr, + inner_rank, + num_partitions, + name, + group=None) return emb_out else: + should_split = False if axis == 0: assert size[0] % num_partitions == 0, ( "Number of rows of the weight for linear ({}) must be" @@ -926,11 +1142,7 @@ def split(x, num_partitions)) per_part_size = size[0] // num_partitions linear_size = (per_part_size, size[1]) - assert x.shape[-1] == per_part_size, ( - "The width ({}) of the input " - "x must be equal to the height ({}) of the weight. Maybe you " - "should split the input x using paddle.split.".format( - x.shape[-1], per_part_size)) + if x.shape[-1] == size[0]: should_split = True elif axis == 1: assert size[1] % num_partitions == 0, ( @@ -952,5 +1164,8 @@ def split(x, bias_attr, gather_out, inner_rank, - name=name) + num_partitions, + should_split, + name=name, + group=None) return linear_out diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 784004269d797..403a02496afaa 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -21,7 +21,7 @@ from .data_generator import MultiSlotDataGenerator, MultiSlotStringDataGenerator from . 
import metrics from .base.topology import CommunicateTopology, HybridCommunicateGroup -from .meta_parallel import random, layers +from .meta_parallel import * __all__ = [ "DistributedStrategy", "UtilBase", "UserDefinedRoleMaker", @@ -74,3 +74,4 @@ set_state_dict = fleet.set_state_dict shrink = fleet.shrink get_hybrid_communicate_group = fleet.get_hybrid_communicate_group +distributed_scaler = fleet.distributed_scaler diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py index 7a4a4a189c92e..b64149f27bcac 100644 --- a/python/paddle/distributed/fleet/ascend_utils.py +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -63,7 +63,6 @@ def _get_ascend_rankfile(rank_table_file_path): Returns: node_ips: node ip list device_count: number of npu per machine - """ json_data = None with open(rank_table_file_path) as json_file: diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 443c5a2954b0c..9fed3a8550c40 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -891,6 +891,58 @@ def pipeline_configs(self, configs): "pipeline_configs") assign_configs_value(self.strategy.pipeline_configs, configs) + @property + def tensor_parallel(self): + """ + Indicating whether we are using tensor parallel for distributed training. + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + + """ + return self.strategy.tensor_parallel + + @tensor_parallel.setter + @is_strict_auto + def tensor_parallel(self, flag): + if isinstance(flag, bool): + self.strategy.tensor_parallel = flag + else: + print("WARNING: tensor_parallel should have value of bool type") + + @property + def tensor_parallel_configs(self): + """ + Set tensor_parallel configurations. + + **Notes**: + **Detailed arguments for tensor_parallel_configs** + **tensor_parallel_degree**: degree of tensor parallel + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4} + + """ + return get_msg_dict(self.strategy.tensor_parallel_configs) + + @tensor_parallel_configs.setter + @is_strict_auto + def tensor_parallel_configs(self, configs): + check_configs_key(self.strategy.tensor_parallel_configs, configs, + "tensor_parallel_configs") + assign_configs_value(self.strategy.tensor_parallel_configs, configs) + @property def hybrid_configs(self): """ diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 5e17794dfeac1..9e200f4ee5f6e 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -29,7 +29,9 @@ from . 
import topology as tp from .topology import ParallelMode from ..meta_parallel import ModelParallel +from ..meta_parallel import PipelineParallel from ..meta_optimizers import HybridParallelOptimizer +from ..meta_optimizers import HybridParallelGradScaler def _inited_runtime_handler_(func): @@ -779,6 +781,9 @@ def forward(self, x): elif self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL: distributed_model = ModelParallel( model, self._hcg, strategy=self._user_defined_strategy) + elif self._hcg.get_parallel_mode() == ParallelMode.PIPELINE_PARALLEL: + distributed_model = PipelineParallel( + model, self._hcg, strategy=self._user_defined_strategy) return distributed_model @dygraph_only @@ -1058,6 +1063,8 @@ def _get_amp_optimizer(self): return amp_optimizer def get_loss_scaling(self): + """Return the real-time loss scaling factor. + """ amp_optimizer = self._get_amp_optimizer() return amp_optimizer.get_loss_scaling() @@ -1333,3 +1340,7 @@ def minimize(self, fleet.util._set_strategy(context["valid_strategy"]) return optimize_ops, params_grads + + @dygraph_only + def distributed_scaler(self, scaler): + return HybridParallelGradScaler(scaler, self._hcg) diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index d26dee331ccf5..8f38ba447fcb3 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -19,6 +19,8 @@ import numpy as np from itertools import product from functools import reduce +from ..utils.log_util import logger + __all__ = ['CommunicateTopology', 'HybridCommunicateGroup'] _HYBRID_PARALLEL_GROUP = None @@ -118,6 +120,7 @@ def __init__(self, topology): self._data_parallel_id = self._get_data_parallel_id() self._model_parallel_id = self._get_model_parallel_id() + self.stage_id = self._get_pipe_parallel_id() assert self._check_vaild_topo( ), "Here is an unreasonable topogy setting. 
world_size: {}, but" \ @@ -129,12 +132,24 @@ def __init__(self, topology): # create comm group for model parallel self._mp_group, self._mp_comm_group = self._set_comm_group("model") + + # create comm group for pipe parallel + self._pp_group, self._pp_comm_group = self._set_comm_group("pipe") + + # create global group for check inf_nan / clip global norm + self._check_group, self._check_comm_group = self._set_check_group( + "data") + + # create p2p group + self.is_first_stage = (self.stage_id == 0) + self.is_last_stage = (self.stage_id == (self._pp_degree - 1)) + debug_str = "HybridParallelInfo: rank_id: %d, dp_degree: %d, " \ - "mp_degree: %d, pp_degree: %d\n" % (self.global_rank, self._dp_degree, + "mp_degree: %d, pp_degree: %d" % (self.global_rank, self._dp_degree, self._mp_degree,self._pp_degree) - debug_str += "dp_group: %s, mp_group: %s" % (self._dp_group, - self._mp_group) - print(debug_str, file=sys.stderr) + debug_str += "dp_group: %s, mp_group: %s, pp_group: %s, check/clip group: %s" % ( + self._dp_group, self._mp_group, self._pp_group, self._check_group) + logger.info(debug_str) global _HYBRID_PARALLEL_GROUP _HYBRID_PARALLEL_GROUP = self @@ -168,6 +183,22 @@ def _set_comm_group(self, parallel_method="data"): return parallel_group, parallel_comm_group + def _set_check_group(self, parallel_method="data"): + parallel_group = [] + parallel_comm_group = None + parallel_size = self._topo.get_dim(parallel_method) + for idx in range(parallel_size): + parallel_groups = self._topo.get_axis_list(parallel_method, idx) + comm_group = paddle.distributed.new_group(ranks=parallel_groups) + if self.global_rank in parallel_groups: + parallel_group = parallel_groups + parallel_comm_group = comm_group + + assert len(parallel_group) > 0 + assert parallel_comm_group is not None + + return parallel_group, parallel_comm_group + def topology(self): return self._topo @@ -205,3 +236,20 @@ def get_model_parallel_group(self): def get_model_parallel_group_src_rank(self): return self._mp_comm_group.ranks[0] + + # pipeline parallel message + def _get_pipe_parallel_id(self): + return self._topo.get_coord(self.global_rank).pipe + + def get_stage_id(self): + return self.stage_id + + def get_pipe_parallel_world_size(self): + return self._pp_degree + + def get_pipe_parallel_group(self): + return self._pp_comm_group + + # check parallel group + def get_check_parallel_group(self): + return self._check_comm_group diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 8dd57c87ef896..827835fde20e3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -26,3 +26,5 @@ from .fp16_allreduce_optimizer import FP16AllReduceOptimizer from .sharding_optimizer import ShardingOptimizer from .dygraph_optimizer import HybridParallelOptimizer +from .dygraph_optimizer import HybridParallelGradScaler +from .tensor_parallel_optimizer import TensorParallelOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index a7f938647ad71..9e2723dad729a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -13,6 +13,7 @@ # limitations under the License. 
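# Illustrative sketch (not part of this patch): how the new HybridCommunicateGroup
# accessors introduced above are expected to be queried once fleet has built the
# hybrid topology; fleet.init with a hybrid-parallel strategy is assumed to have
# run already.
import paddle.distributed.fleet as fleet

hcg = fleet.get_hybrid_communicate_group()
mp_group = hcg.get_model_parallel_group()      # ranks sharing one model-parallel group
pp_group = hcg.get_pipe_parallel_group()       # ranks in the same pipeline
check_group = hcg.get_check_parallel_group()   # group used for inf/nan check and grad clip
last_stage = hcg.get_stage_id() == hcg.get_pipe_parallel_world_size() - 1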
from __future__ import print_function +import os import paddle.fluid as fluid from paddle.fluid import core, unique_name @@ -77,6 +78,7 @@ def _init_communicator(self, nranks = len(endpoints) other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) + if rank == 0 and wait_port: wait_server_ready(other_endpoints) @@ -163,6 +165,33 @@ def _add_sync_by_allreduce(block): 'ring_id': ring_id, OP_ROLE_KEY: OpRole.Forward }) + elif core.is_compiled_with_npu(): + hccl_id_var = block.create_var( + name=unique_name.generate('hccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} + block.append_op( + type='c_gen_hccl_id', + inputs={}, + outputs={'Out': hccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + OP_ROLE_KEY: OpRole.Forward + }) + block.append_op( + type='c_comm_init_hccl', + inputs={'X': hccl_id_var}, + outputs={}, + attrs={ + 'rank': rank, + 'ring_id': ring_id, + 'device_id': int(os.getenv("FLAGS_selected_npus")), + 'rank_ids': nranks, + OP_ROLE_KEY: OpRole.Forward + }) else: raise ValueError( "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu." diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py index a2a3bb8d17201..4e41723cb622d 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py @@ -11,3 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and from .hybrid_parallel_optimizer import HybridParallelOptimizer +from .hybrid_parallel_gradscaler import HybridParallelGradScaler diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py new file mode 100644 index 0000000000000..11bb897a678b7 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py @@ -0,0 +1,77 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import sys +from paddle.optimizer import Optimizer +from ...base.topology import ParallelMode +from paddle.fluid.dygraph import base as imperative_base +from paddle.fluid import framework +from paddle.fluid.framework import Variable +import types +from paddle.fluid import core +import paddle + + +class HybridParallelGradScaler: + def __init__(self, scaler, hcg): + self._scaler = scaler + self._hcg = hcg + self._is_mp = ( + self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL) + + def scale(self, var): + return self._scaler.scale(var) + + def minimize(self, optimizer, *args, **kwargs): + if not self._enable: + return optimizer.minimize(*args, **kwargs) + + # unscale the grad + self._unscale(optimizer) + + optimize_ops, params_grads = (None, None) + + if self._found_inf: + self._cache_founf_inf = True + else: + optimize_ops, params_grads = optimizer.minimize(*args, **kwargs) + self._cache_founf_inf = False + + if self._use_dynamic_loss_scaling: + self._update() + + return optimize_ops, params_grads + + @imperative_base.no_grad + def _unscale(self, optimizer): + if not self._enable: + return + param_grads = [ + param._grad_ivar() for param in optimizer._parameter_list + if param._grad_ivar() is not None + ] + core.ops.check_finite_and_unscale(param_grads, self._scale, param_grads, + self._found_inf) + # allreduce_max found_inf in check_group + if self._is_mp: + self._found_inf = paddle.cast(self._found_inf, dtype="int32") + paddle.distributed.all_reduce( + self._found_inf, + op=paddle.distributed.ReduceOp.MAX, + group=self._hcg.get_check_parallel_group()) + self._found_inf = paddle.cast(self._found_inf, dtype="bool") + + def __getattr__(self, item): + return getattr(self._scaler, item) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index b1cf98b4b1d2f..52e87173684a3 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -12,15 +12,77 @@ # See the License for the specific language governing permissions and # limitations under the License. 
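# Minimal dygraph sketch (assumptions: fleet.init has already been called with a
# hybrid-parallel DistributedStrategy, and `model`, `optimizer`, `data` stand in
# for any user network, optimizer, and input batch) showing how the new
# HybridParallelGradScaler is obtained through fleet.distributed_scaler and used
# like a plain GradScaler:
import paddle
import paddle.distributed.fleet as fleet

model = fleet.distributed_model(model)
optimizer = fleet.distributed_optimizer(optimizer)
scaler = fleet.distributed_scaler(paddle.amp.GradScaler(init_loss_scaling=1024))

with paddle.amp.auto_cast():
    loss = model(data).mean()
scaled = scaler.scale(loss)         # scale the loss
scaled.backward()                   # backward on the scaled loss
scaler.minimize(optimizer, scaled)  # unscale, allreduce-max the found_inf flag over
                                    # the check group under model parallelism, then step
optimizer.clear_grad()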
+from __future__ import print_function +import sys from paddle.optimizer import Optimizer +from paddle.fluid.clip import ClipGradByGlobalNorm from ...utils.hybrid_parallel_util import fused_allreduce_gradients from ...base.topology import ParallelMode from paddle.fluid.dygraph import base as imperative_base from paddle.fluid import framework from paddle.fluid.framework import Variable +from ...utils.log_util import logger + + +class HybridParallelClipGrad: + def __init__(self, clip, hcg): + self._clip = clip + self._hcg = hcg + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + sum_square_list = [] + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + continue + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(g) + merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + sum_square_list.append(sum_square) + + # all parameters have been filterd out + if len(sum_square_list) == 0: + return params_grads + + global_norm_var = layers.concat(sum_square_list) + global_norm_var = layers.reduce_sum(global_norm_var) + # add all reduce to get global norm in world size + paddle.distributed.all_reduce(global_norm_var, + self._hcg.get_check_parallel_group()) + global_norm_var = layers.sqrt(global_norm_var) + + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + new_grad = layers.elementwise_mul(x=g, y=clip_var) + params_and_grads.append((p, new_grad)) + + return params_and_grads + + def __getattr__(self, item): + return getattr(self._clip, item) + + def __call__(self, params_grads): + return self._clip(params_grads) class HybridParallelOptimizer: + # adapter wrapper for optimizer def __init__(self, optimizer, hcg, strategy): self._inner_opt = optimizer self._strategy = strategy @@ -29,6 +91,13 @@ def __init__(self, optimizer, hcg, strategy): self._hcg.get_parallel_mode() == ParallelMode.MODEL_PARALLEL) self._need_dp = (self._hcg.get_data_parallel_world_size() > 1) + if isinstance(self._inner_opt._grad_clip, + ClipGradByGlobalNorm) and self._is_mp: + logger.warning("using ClipGradByGlobalNorm in ModelParallel, the origin " \ + "optmizer'grad clip will be changed.") + self._inner_opt._grad_clip = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) + @imperative_base.no_grad @framework.dygraph_only def step(self): diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index ae2daa9b9d859..1aa51a6671c17 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -13,6 +13,7 @@ from __future__ import print_function from __future__ import division +import os import paddle.fluid as fluid from paddle.fluid import core, unique_name diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 2c4ad33c361e0..852421523b15b 100755 --- 
a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -365,8 +365,8 @@ def minimize_impl(self, 'w') as f: f.writelines(str(main_block.program)) - self._wait() - + if core.is_compiled_with_cuda(): + self._wait() return optimize_ops, params_grads def _init_comm(self): diff --git a/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py new file mode 100644 index 0000000000000..2ba0195156082 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/tensor_parallel_optimizer.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +from __future__ import print_function +from __future__ import division + +import paddle.fluid as fluid +from paddle.fluid import core, unique_name +from .meta_optimizer_base import MetaOptimizerBase +from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op + + +class TensorParallelOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(TensorParallelOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + self.meta_optimizers_white_list = [ + "RecomputeOptimizer", + "AMPOptimizer", + "LarsOptimizer", + "LambOptimizer", + ] + self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] + self.mp_ring_id = 0 + self.global_ring_id = 1 + self.dp_ring_id = 2 + + def _set_basic_info(self, loss, role_maker, user_defined_optimizer, + user_defined_strategy): + super(TensorParallelOptimizer, self)._set_basic_info( + loss, role_maker, user_defined_optimizer, user_defined_strategy) + self.mp_degree = user_defined_strategy.tensor_parallel_configs[ + 'tensor_parallel_degree'] + + def _can_apply(self): + if not self.role_maker._is_collective: + return False + + if self.user_defined_strategy.tensor_parallel == True: + return True + return False + + def _disable_strategy(self, dist_strategy): + dist_strategy.tensor_parallel = False + dist_strategy.tensor_parallel_configs = {} + + def _enable_strategy(self, dist_strategy, context): + dist_strategy.tensor_parallel = True + dist_strategy.tensor_parallel_configs = {"tensor_parallel_degree": 1, } + + def _broadcast_params(self, ring_id, mp_mode): + block = self.startup_program.global_block() + param = None + for param in block.iter_parameters(): + if param.is_distributed and mp_mode: + continue + + block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring_id, + 'root': 0, + OP_ROLE_KEY: OpRole.Forward + }) + + if not param: return # no parameter on this device + block.append_op( + type='c_sync_comm_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward}) + + def _get_process_group_info(self): + # global ring info + self.global_endpoints = self.endpoints + 
self.global_rank = self.rank + self.global_nranks = self.nranks + + # model parallel ring info + self.mp_rank = self.rank % self.mp_degree + self.mp_nranks = self.mp_degree + mp_group = self.rank // self.mp_degree + self.mp_endpoints = [ + self.endpoints[i] for i in range(self.global_nranks) + if i // self.mp_degree == mp_group + ] + + # data parallel ring info + if self.nranks > self.mp_degree: + self.dp_rank = self.rank // self.mp_degree + self.dp_nranks = self.nranks // self.mp_degree + start_index = self.rank % self.mp_degree + self.dp_endpoints = [ + self.endpoints[start_index + i * self.mp_degree] + for i in range(self.dp_nranks) + ] + + def _init_process_group(self): + self._get_process_group_info() + collective_helper = CollectiveHelper(self.role_maker, wait_port=False) + + # Create global ring for all gpus + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.global_endpoints, + self.global_rank, self.global_ring_id, True, self.global_ring_id, + True) + + # Create model parallel ring for all gpus + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.mp_endpoints, + self.mp_rank, self.mp_ring_id, True, self.global_ring_id, True) + #self._broadcast_params(self.mp_ring_id, mp_mode=True) + + # Create dp rings + if self.nranks > self.mp_degree: + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.dp_endpoints, + self.dp_rank, self.dp_ring_id, True, self.global_ring_id, True) + self._broadcast_params(self.dp_ring_id, mp_mode=False) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + self.endpoints = self.role_maker._get_trainer_endpoints() + self.current_endpoint = self.endpoints[self.role_maker._worker_index()] + self.startup_program = startup_program + if startup_program is None: + self.startup_program = fluid.default_startup_program() + + optimize_ops, params_grads = self.inner_opt.minimize( + loss, self.startup_program, parameter_list, no_grad_set) + + self.main_program = loss.block.program + self.nranks = len(self.endpoints) + self.rank = self.role_maker._worker_index() + + self._init_process_group() + + assert self.nranks % self.mp_degree == 0 + + if self.nranks > self.mp_degree: + # data parallelism + dp_degree = self.nranks // self.mp_degree + self._transpile_main_program(loss, dp_degree) + return optimize_ops, params_grads + + def _transpile_main_program(self, loss, dp_degree): + self._insert_loss_grad_ops(loss, dp_degree) + self._insert_allreduce_ops(loss, self.dp_ring_id) + + def _insert_loss_grad_ops(self, loss, dp_degree): + """ + In order to keep the learning rate consistent in different numbers of + training workers, we scale the loss grad by the number of workers + """ + block = loss.block + for idx, op in reversed(list(enumerate(block.ops))): + if is_loss_grad_op(op): + loss_grad_var = block.vars[op.output_arg_names[0]] + block._insert_op( + idx + 1, + type='scale', + inputs={'X': loss_grad_var}, + outputs={'Out': loss_grad_var}, + attrs={ + 'scale': 1.0 / dp_degree, + OP_ROLE_KEY: OpRole.Backward + }) + break + + def _insert_allreduce_ops(self, loss, ring_id): + block = loss.block + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if is_backward_op(op) and OP_ROLE_VAR_KEY in op.attr_names: + op_role_var = op.attr(OP_ROLE_VAR_KEY) + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + offset = idx + for i in range(0, len(op_role_var), 2): + param = 
block.vars[op_role_var[i]] + grad = block.vars[op_role_var[i + 1]] + if offset == idx: + offset += 1 + block._insert_op( + offset, + type='c_sync_calc_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={OP_ROLE_KEY: OpRole.Backward}) + offset += 1 + + block._insert_op( + offset, + type='c_allreduce_sum', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Backward + }) + + if grad is None: + return + + for idx, op in list(enumerate(block.ops)): + if is_optimizer_op(op): + block._insert_op( + idx, + type='c_sync_comm_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Backward}) + break diff --git a/python/paddle/distributed/fleet/meta_parallel/__init__.py b/python/paddle/distributed/fleet/meta_parallel/__init__.py index 7ecb97bf8234a..ed1add1f7baee 100644 --- a/python/paddle/distributed/fleet/meta_parallel/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .mp_utils import * +from .parallel_layers import * from .model_parallel import ModelParallel +from .pipeline_parallel import PipelineParallel diff --git a/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py b/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py index 5cf1242a37ad5..cdf947895b777 100644 --- a/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py +++ b/python/paddle/distributed/fleet/meta_parallel/meta_parallel_base.py @@ -13,7 +13,6 @@ # limitations under the License. from paddle.fluid.dygraph.layers import Layer -import logging class MetaParallelBase(Layer): @@ -22,6 +21,7 @@ def __init__(self, layers, hcg, strategy): self).__init__(layers.full_name() + "_meta_parallel_base") self._layers = layers self._hcg = hcg + self._strategy = strategy self._prepare_for_model() def _prepare_for_model(self): diff --git a/python/paddle/distributed/fleet/meta_parallel/model_parallel.py b/python/paddle/distributed/fleet/meta_parallel/model_parallel.py index 62f5266250f60..ebf26498d9324 100644 --- a/python/paddle/distributed/fleet/meta_parallel/model_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/model_parallel.py @@ -15,6 +15,7 @@ from paddle.fluid.dygraph.layers import Layer from .meta_parallel_base import MetaParallelBase from ..utils.hybrid_parallel_util import * +from ..utils.log_util import logger class ModelParallel(MetaParallelBase): @@ -22,8 +23,14 @@ def __init__(self, layers, hcg, **kwargs): super(ModelParallel, self).__init__(layers, hcg, **kwargs) def _prepare_for_model(self): + logger.info("start broadcast mp parameters") broadcast_mp_parameters(self._layers, self._hcg) + + logger.info("start broadcast mp parameters") broadcast_dp_parameters(self._layers, self._hcg) + logger.info("mp's parameters is ready") + def _pre_forward(self, *inputs, **kwargs): + logger.debug("mp start broadcast input data") return broadcast_input_data(self._hcg, *inputs, **kwargs) diff --git a/python/paddle/distributed/fleet/meta_parallel/mp_utils/layers_help.py b/python/paddle/distributed/fleet/meta_parallel/mp_utils/layers_help.py deleted file mode 100644 index e32db686efd44..0000000000000 --- a/python/paddle/distributed/fleet/meta_parallel/mp_utils/layers_help.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
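The ring construction in `TensorParallelOptimizer._get_process_group_info` is plain integer arithmetic over the trainer rank. The standalone sketch below (a hypothetical helper, illustration only) restates that mapping so the model-parallel and data-parallel ring membership is easy to verify without launching trainers.

```python
def build_ring_info(rank, endpoints, mp_degree):
    """Reproduce the mp/dp grouping used above (illustrative only).
    The dp ring is only built by the optimizer when nranks > mp_degree."""
    nranks = len(endpoints)
    assert nranks % mp_degree == 0

    # model-parallel ring: consecutive ranks share one mp group
    mp_group = rank // mp_degree
    mp_rank = rank % mp_degree
    mp_endpoints = [endpoints[i] for i in range(nranks)
                    if i // mp_degree == mp_group]

    # data-parallel ring: ranks with the same mp_rank form one dp group
    dp_rank = rank // mp_degree
    dp_endpoints = [endpoints[mp_rank + i * mp_degree]
                    for i in range(nranks // mp_degree)]
    return mp_rank, mp_endpoints, dp_rank, dp_endpoints

# e.g. 4 trainers with mp_degree=2 -> two mp rings of 2 and two dp rings of 2
print(build_ring_info(2, ["ep0", "ep1", "ep2", "ep3"], 2))
# (0, ['ep2', 'ep3'], 1, ['ep0', 'ep2'])
```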
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.autograd import PyLayer -from ...base import topology as tp -import paddle - -# Follow this paper to achieve the file: -# Shoeybi M, Patwary M, Puri R, et al. Megatron-lm: Training multi-billion parameter -# language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019. (https://arxiv.org/abs/1909.08053) - - -def mp_reduce(x): - if tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size() == 1: - return x - - paddle.distributed.all_reduce( - x, group=tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group()) - - return x - - -def mp_split(x): - world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size() - - if world_size == 1: - return x - - rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank() - last_dim = len(x.shape) - 1 - input_list = paddle.split(x, num_or_sections=world_size, axis=last_dim) - output = input_list[rank] - - return output - - -def mp_gather(x): - world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size() - - if world_size == 1: - return x - - output = [] - paddle.distributed.all_gather( - output, x, group=tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group()) - - output = paddle.concat(output, axis=len(x.shape) - 1) - - return output - - -class _IdentityInModelParallel(PyLayer): - @staticmethod - def forward(ctx, x): - return x - - @staticmethod - def backward(ctx, dx): - return mp_reduce(dx) - - -class _ReduceInModelParallel(PyLayer): - @staticmethod - def forward(ctx, x): - return mp_reduce(x) - - @staticmethod - def backward(ctx, dx): - return dx - - -class _ScatterInModelParallel(PyLayer): - @staticmethod - def forward(ctx, x): - return mp_split(x) - - @staticmethod - def backward(ctx, dx): - return mp_gather(dx) - - -class _GatherInModelParallel(PyLayer): - @staticmethod - def forward(ctx, x): - return mp_gather(x) - - @staticmethod - def backward(ctx, dx): - return mp_split(dx) - - -def identity_in_model_parallel(x): - return _IdentityInModelParallel.apply(x) - - -def reduce_in_model_parallel(x): - return _ReduceInModelParallel.apply(x) - - -def scatter_in_model_parallel(x): - return _ScatterInModelParallel.apply(x) - - -def gather_in_model_parallel(x): - return _GatherInModelParallel.apply(x) diff --git a/python/paddle/fluid/contrib/reader/__init__.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py similarity index 74% rename from python/paddle/fluid/contrib/reader/__init__.py rename to python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py index 32054d1421a27..c4ec61e84ffa5 100644 --- a/python/paddle/fluid/contrib/reader/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -12,9 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -from .distributed_reader import * - -__all__ = [] -__all__ += distributed_reader.__all__ +from .mp_layers import * +from .pp_layers import * +from .random import * diff --git a/python/paddle/distributed/fleet/meta_parallel/mp_utils/layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py similarity index 86% rename from python/paddle/distributed/fleet/meta_parallel/mp_utils/layers.py rename to python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index b7512afd9a6de..b89e90128b112 100644 --- a/python/paddle/distributed/fleet/meta_parallel/mp_utils/layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -18,7 +18,6 @@ from paddle.nn import functional as F from paddle import framework from ...base import topology as tp -from .layers_help import identity_in_model_parallel, gather_in_model_parallel, reduce_in_model_parallel, scatter_in_model_parallel __all__ = [ 'VocabParallelEmbedding', 'ColumnParallelLinear', 'RowParallelLinear' @@ -75,8 +74,13 @@ def forward(self, x): if len(origin_input_shape) == 2: x_shard = paddle.squeeze(x_shard, axis=-1) - emb_out_ = self.embedding(x_shard) - emb_out = reduce_in_model_parallel(emb_out_) + emb_out = self.embedding(x_shard) + if self.world_size > 1: + emb_out = paddle.distributed.collective._mp_allreduce( + emb_out, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) return emb_out @@ -123,11 +127,16 @@ def __init__(self, self.bias = None def forward(self, x): - input_parallel = identity_in_model_parallel(x) + # use inner api to process identity + input_parallel = paddle.distributed.collective._c_identity( + x, group=self.model_parallel_group) output_parallel = F.linear( input_parallel, self.weight, self.bias, name=self.name) if self.gather_output: - output = gather_in_model_parallel(output_parallel) + output = paddle.distributed.collective._c_concat( + output_parallel, + nranks=self.world_size, + group=self.model_parallel_group) else: output = output_parallel return output @@ -182,9 +191,18 @@ def forward(self, x): input_parallel = x else: # split last dim - input_parallel = scatter_in_model_parallel(x) + input_parallel = paddle.distributed.collective._c_split( + x, + rank=self.rank, + nranks=self.world_size, + group=self.model_parallel_group) output_parallel = F.linear(input_parallel, self.weight, name=self.name) - output_ = reduce_in_model_parallel(output_parallel) + output_ = paddle.distributed.collective._mp_allreduce( + output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + output = output_ + self.bias if self.bias is not None else output_ return output diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py new file mode 100644 index 0000000000000..e2db689eb7674 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -0,0 +1,156 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
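The rewritten mp_layers replace the deleted PyLayer helpers with the collective primitives `_c_identity`, `_c_concat`, `_c_split` and `_mp_allreduce`. The algebra those layers rely on can be checked on a single process; the small numpy sketch below is illustration only and involves no communication.

```python
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 8))   # [batch, in_features]
w = rng.standard_normal((8, 6))   # [in_features, out_features]

# ColumnParallelLinear: each rank owns a column slice of W and produces a
# slice of the output; concatenating (gather) the slices recovers the result.
w_cols = np.split(w, 2, axis=1)
col_parallel = np.concatenate([x @ wi for wi in w_cols], axis=1)
assert np.allclose(col_parallel, x @ w)

# RowParallelLinear: each rank owns a row slice of W and the matching column
# slice of X; summing (all-reduce) the partial products recovers the result.
x_cols = np.split(x, 2, axis=1)
w_rows = np.split(w, 2, axis=0)
row_parallel = sum(xi @ wi for xi, wi in zip(x_cols, w_rows))
assert np.allclose(row_parallel, x @ w)
```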
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import paddle +from paddle.fluid.dygraph.layers import Layer +from ...utils.log_util import logger, layer_to_str + +__all__ = ['LayerDesc', 'PipelineLayer'] + + +class SegmentLayers(object): + def __init__(self, layers_desc, num_parts, method="uniform"): + self._layers_desc = layers_desc + self.method = method + self.num_parts = num_parts + self.num_items = len(layers_desc) + assert self.num_items >= self.num_parts, "layer number should be greater than number of segments" + + def do_segment(self): + if self.method == "uniform": + return self.uniform(self.num_items, self.num_parts) + + def uniform(self, num_items, num_parts): + result = [0 for _ in range(num_parts + 1)] + part_size = math.floor(num_items / num_parts) + for i in range(num_parts): + result[i] = int(min(part_size * i, num_items)) + result[num_parts] = num_items + return result + + +class LayerDesc(object): + def __init__(self, layer_func, *inputs, **kwargs): + self.layer_func = layer_func + self.inputs = inputs + self.kwargs = kwargs + + if not issubclass(layer_func, Layer): + raise TypeError( + "The input(layer_func) should be a derived class of Layer.") + + def build_layer(self): + return self.layer_func(*self.inputs, **self.kwargs) + + def __repr__(self): + return layer_to_str(self.layer_func.__name__, *self.inputs, + **self.kwargs) + + +class PipelineLayer(Layer): + def __init__(self, + layers, + num_stages=None, + topology=None, + loss_fn=None, + seg_method="uniform"): + super(PipelineLayer, self).__init__() + if num_stages is None and topology is None: + raise ValueError("should provide num_stages or topology") + + # lazy import + import paddle.distributed as dist + from paddle.distributed import fleet + + self.device_id = dist.ParallelEnv().device_id + self.layers = layers + self._loss_fn = loss_fn + self._topo = topology + word_size = dist.get_world_size() + self.global_rank = dist.get_rank() + + if self._topo: + self._stage_id = self._topo.get_coord(self.global_rank).pipe + self._num_stages = self._topo.get_dim_size("pipe") + if num_stages: + assert self._num_stages == num_stages, "num_stages should be equal to be %d" % ( + self._num_stages) + else: + # construct default topology + if word_size % num_stages != 0: + raise ValueError("should provide correct num_stages({}) " + "which can be divided by word_size({})".format( + num_stages, word_size)) + dp_num = word_size // num_stages + self._topo = fleet.CommunicateTopology(["data", "pipe", "model"], + [dp_num, num_stages, 1]) + self._stage_id = self._topo.get_coord(self.global_rank).pipe + self._num_stages = self._topo.get_dim_size("pipe") + + # initialize segment + self._layers_desc = list(self.layers) + self._num_layers = len(self._layers_desc) + self._start_pos = 0 + self._end_pos = self._num_layers - 1 + self._segment_network(seg_method) + + # construct layer + self.run_function = [] + self._build_layer() + self.to(paddle.CUDAPlace(self.device_id)) + + def _segment_network(self, seg_method): + logger.info("start segment network..") + seg = SegmentLayers( + self._layers_desc, num_parts=self._num_stages, method=seg_method) + self.segment_parts = 
seg.do_segment() + + self._start_pos = self.segment_parts[self._stage_id] + self._end_pos = self.segment_parts[self._stage_id + 1] + + # print information for debug + for stage in range(self._num_stages): + start = self.segment_parts[stage] + end = self.segment_parts[stage + 1] + logger.info("stage={}, global_rank={} ,layer_number={}".format( + stage, self.global_rank, end - start)) + + for index, layer in enumerate(self._layers_desc[start:end]): + logger.info("{}: {}".format(index + start, str(layer))) + + if self._loss_fn: + try: + logger.info("loss: {}".format(self._loss_fn.__name__)) + except AttributeError: + logger.info("loss: {}".format(self._loss_fn.__class__.__name__)) + + def _build_layer(self): + start = self._start_pos + end = self._end_pos + for index, layer in enumerate(self._layers_desc[start:end]): + layer_index = start + index + if isinstance(layer, Layer): + self.run_function.append(layer) + self.add_sublayer(str(layer_index), layer) + elif isinstance(layer, LayerDesc): + model = layer.build_layer() + self.run_function.append(model) + self.add_sublayer(str(layer_index), model) + else: + self.run_function.append(layer) + + def forward(self, input): + for layer in self.run_function: + input = layer(input) + return input diff --git a/python/paddle/distributed/fleet/meta_parallel/mp_utils/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py similarity index 100% rename from python/paddle/distributed/fleet/meta_parallel/mp_utils/random.py rename to python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py new file mode 100644 index 0000000000000..98a82f2b79856 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -0,0 +1,427 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
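SegmentLayers' uniform policy reduces to building cumulative cut points so that stage `i` runs layers `[parts[i], parts[i+1])`, with any remainder absorbed by the last stage. A standalone restatement of that arithmetic:

```python
import math

def uniform_segments(num_layers, num_stages):
    """Cut points used by the uniform policy above:
    stage i owns layers [parts[i], parts[i+1])."""
    part_size = math.floor(num_layers / num_stages)
    parts = [min(part_size * i, num_layers) for i in range(num_stages)]
    parts.append(num_layers)
    return parts

# 10 layers over 4 stages -> [0, 2, 4, 6, 10]; stages get 2, 2, 2 and 4 layers,
# which is exactly what the "layer_number" log lines report.
print(uniform_segments(10, 4))
```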
+# See the License for the specific language governing permissions and + +import time +import copy +import os + +from types import MethodType + +from numpy import prod + +import paddle +import paddle.fluid as fluid +from .meta_parallel_base import MetaParallelBase +from .pp_utils.utils import get_tensor_bytes +from .pp_utils import utils +from .parallel_layers.pp_layers import PipelineLayer + +FLOAT_TYPES = [ + paddle.float16, + paddle.float32, + paddle.float64, +] + + +class PipelineParallel(MetaParallelBase): + def __init__(self, layers, hcg, strategy): + super(PipelineParallel, self).__init__(layers, hcg, strategy) + + self.use_pipe_parallel = self._hcg.get_pipe_parallel_world_size() > 1 + self.use_data_parallel = self._hcg.get_data_parallel_world_size() > 1 + self.use_model_parallel = self._hcg.get_model_parallel_world_size() > 1 + + self.num_caches = 0 + self.caches = { + 'inputs': [], + 'labels': [], + 'outputs': [], + 'backward_tensors': [], + } + self.recv_cache = None + self.grad_tensors = None + + self.meta_buffer = None + + self.send_meta = True + self.first_gradient_send = True + + self.current_loss = paddle.to_tensor(0.0) + self.total_loss = None + + def _prepare_for_model(self): + self.micro_batch_size = self._strategy.pipeline_configs[ + 'micro_batch_size'] + self.accumulate_steps = self._strategy.pipeline_configs[ + 'accumulate_steps'] + + self.num_stages = self._hcg.get_pipe_parallel_world_size() + self.stage_id = self._hcg.get_stage_id() + self.prev_stage_id = self.stage_id - 1 + self.next_stage_id = self.stage_id + 1 + self._layers = PipelineLayer( + layers=self._layers, num_stages=self.num_stages) + #TODO: init process group + + def _allocate_caches(self, num_caches): + if self.num_caches >= num_caches: + return + + num = num_caches - self.num_caches + self.num_caches = num_caches + for key in self.caches: + self.caches[key].extend([None] * num) + + def train_batch(self, data_iter, optimizer): + self.optimizer = optimizer + assert fluid.framework._dygraph_tracer()._has_grad, ( + 'Please enable the generation of gradients.') + + if self.stage_id == 0 or self.stage_id == self.num_stages - 1: + assert data_iter, ( + "For the first and the last stage, the data_iter must be set.") + else: + assert data_iter is None, ( + "For pipe stages other than the first and the last one, " + "the data_iter must be None.") + self.data_iter = data_iter + self._layers.train() + self.total_loss = None + + minibatch_cmds = utils.TrainGenerator(self.accumulate_steps, + self.num_stages, self.stage_id) + self._train(minibatch_cmds) + return self.total_loss + + def _train(self, minibatch_cmds): + self._allocate_caches(self.num_stages) + for microbatch_cmds in minibatch_cmds: + for cmd in microbatch_cmds: + if type(cmd) not in self._COMMAND_MAP: + #FIXME: + continue + + self._apply_cmd = MethodType(self._COMMAND_MAP[type(cmd)], self) + self._apply_cmd(**cmd.kwargs) + + def _allreduce_grads(self): + self._modifying_grad = True + assert self.use_data_parallel <= 1, ("Do not support data parallel " + "with pipeline parallel now.") + self._modifying_grad = False + + def _get_data(self): + if self.use_model_parallel: + mp_rank = self._hcg.get_model_parallel_rank() + else: + mp_rank = 0 + + data = None + + # mp rank 0 loads the data and broadcat it to others. 
+ if mp_rank == 0: + data = next(self.data_iter) + if self.use_model_parallel: + data = paddle.distributed.broadcast( + data, group=self._hcg.get_model_parallel_group()) + return data + + def _forward(self, cache_id): + if isinstance(self.caches['inputs'][cache_id], tuple): + inputs = tuple(t.clone() for t in self.caches['inputs'][cache_id]) + else: + inputs = self.caches['inputs'][cache_id].clone() + + self._clear_grads(inputs) + outputs = self._layers.forward(inputs) + + self.caches['outputs'][cache_id] = outputs + + if self.stage_id == self.num_stages - 1: + self.current_loss = outputs + if isinstance(self.current_loss, paddle.Tensor): + if self.total_loss is None: + self.total_loss = paddle.zeros_like(self.current_loss) + self.total_loss += self.current_loss.detach() + else: + if self.total_loss is None: + self.total_loss = [ + paddle.zeros_like(v) for v in self.current_loss + ] + for idx, v in enumerate(self.current_loss): + self.total_loss[idx] += v.detach() + + def _backward(self, cache_id): + assert self.optimizer is not None + if self.stage_id == self.num_stages - 1: + paddle.autograd.backward(self.current_loss) + return + + outputs = self.caches['outputs'][cache_id] + + grad_tensors = self.grad_tensors + if isinstance(outputs, tuple): + out_tensors = [t for t in outputs if t.dtype in FLOAT_TYPES] + assert len(out_tensors) == len(grad_tensors) + paddle.autograd.backward( + tensors=out_tensors, grad_tensors=grad_tensors) + else: + paddle.autograd.backward( + tensors=[outputs], grad_tensors=[grad_tensors]) + + self.caches['outputs'][cache_id] = None + grad_tensors = None + + def _load_micro_batch(self, cache_id): + inputs = self._get_data() + + if self.stage_id == 0: + data = None + if isinstance(inputs[0], paddle.Tensor): + data = inputs[0].clone().detach() + data.stop_gradient = data.dtype == paddle.float32 + else: + assert isinstance(inputs[0], tuple) + # Assume list or tuple + data = [] + for d in inputs[0]: + assert isinstance(d, paddle.Tensor) + d = d.clone().detach() + d.stop_gradient = d.dtype == paddle.float32 + loaded.append(d) + data = tuple(data) + self.caches['inputs'][cache_id] = data + + if self.stage_id == self.num_stages - 1: + label = None + if isinstance(inputs[1], paddle.Tensor): + label = inputs[1] + elif isinstance(data[1], tuple): + label = [] + for l in inputs[1]: + assert isinstance(l, paddle.Tensor) + l = l.detach() + label.append(l) + label = tuple(label) + self.caches['labels'][cache_id] = label + + def _send_meta(self, data, peer): + """ + % type (0: tensor, 1: tuple) + % num_tensors if type=tuple + foreach tensor: + % ndims + % shape + """ + if isinstance(data, paddle.Tensor): + tensor_type = paddle.to_tensor([0]) + paddle.distributed.send(tensor_type, peer) + dims = paddle.to_tensor(len(data.shape)) + paddle.distributed.send(dims, peer) + shape = paddle.to_tensor(data.shape) + paddle.distributed.send(shape, peer) + elif isinstance(data, tuple): + tensor_type = paddle.to_tensor([1]) + paddle.distributed.send(tensor_type, peer) + nums = paddle.to_tensor(len(data)) + paddle.distributed.send(nums, peer) + for idx, d in enumerate(data): + assert isinstance(d, paddle.Tensor) + dims = paddle.to_tensor(len(d.shape)) + paddle.distributed.send(dims, peer) + shape = paddle.to_tensor(d.shape) + paddle.distributed.send(shape, peer) + + def _recv_meta(self, peer): + tensor_type = paddle.to_tensor([0]) + paddle.distributed.recv(tensor_type, peer) + tensor_type = tensor_type.numpy()[0] + + if tensor_type == 0: + dims = paddle.to_tensor([0]) + 
paddle.distributed.recv(dims, peer) + dims = dims.numpy()[0] + shape = paddle.to_tensor([0] * dims) + paddle.distributed.recv(shape, peer) + shape = shape.numpy().tolist() + return self._allocate_buffer( + shape, dtype="float32", num_caches=1)[0] + elif tensor_type == 1: + num = paddle.to_tensor([0]) + paddle.distributed.recv(num, peer) + num = num.numpy()[0] + shapes = [] + for i in range(num): + dims = paddle.to_tensor([0]) + paddle.distributed.recv(dims, peer) + dims = dims.numpy()[0] + shape = paddle.to_tensor([0] * dims) + paddle.distributed.recv(shape, peer) + shapes.append(shape.numpy().tolist()) + + dtypes = ["float32"] * len(shapes) + caches = self._allocate_buffers(shapes, dtypes, num_buffers=1)[0] + buffers = tuple(buffers) + return buffers + + def _send_activations(self, cache_id): + outputs = self.caches['outputs'][cache_id] + + if self.send_meta: + self.send_meta = False + self._send_meta(outputs, self.next_stage_id) + + if isinstance(outputs, paddle.Tensor): + paddle.distributed.send(outputs, self.next_stage_id) + elif isinstance(outputs, tuple): + for output in outputs: + paddle.distributed.send(output, self.next_stage_id) + + def _send_gradients(self, cache_id): + inputs = self.caches['inputs'][cache_id] + + if isinstance(inputs, paddle.Tensor): + assert inputs.grad is not None + paddle.distributed.send( + paddle.to_tensor(inputs.grad), self.prev_stage_id) + else: + for idx, d in enumerate(inputs): + # Skip tensors that will not produce a grad + if not d.dtype in FLOAT_TYPES: + assert d.grad is None + continue + assert d.grad is not None + paddle.distributed.send(d.grad, self.prev_stage_id) + self.caches['inputs'][cache_id] = None + + def _recv_activations(self, cache_id): + inputs = None + + # Allocate the buffer if necessary + if self.recv_cache is None: + self.recv_cache = self._recv_meta(self.prev_stage_id) + + if isinstance(self.recv_cache, paddle.Tensor): + paddle.distributed.recv(self.recv_cache, self.prev_stage_id) + inputs = self.recv_cache.clone().detach() + inputs.stop_gradient = inputs.dtype not in FLOAT_TYPES + else: + assert isinstance(self.recv_cache, tuple) + inputs = [None] * len(self.recv_cache) + for idx, d in enumerate(self.recv_cache): + assert isinstance(d, paddle.Tensor) + + paddle.distributed.recv(d, self.prev_stage_id) + inputs[idx] = d.clone().detach() + + inputs = tuple(inputs) + + for d in inputs: + d.stop_gradient = d.dtype not in FLOAT_TYPES + + self.caches['inputs'][cache_id] = inputs + + def _recv_gradients(self, cache_id): + outputs = self.caches['outputs'][cache_id] + if self.grad_tensors is None: + if isinstance(outputs, paddle.Tensor): + s = list(outputs.shape) + dtype = 'float32' + self.grad_tensors = self._allocate_buffer( + s, dtype, num_buffers=1)[0] + else: + sizes = [ + list(d.shape) for d in outputs if d.dtype in FLOAT_TYPES + ] + dtypes = ['float32'] * len(sizes) + self.grad_tensors = self._allocate_buffers( + sizes, dtypes, num_buffers=1)[0] + + if isinstance(self.grad_tensors, paddle.Tensor): + paddle.distributed.recv(self.grad_tensors, self.next_stage_id) + else: + assert isinstance(outputs, tuple) + for d in self.grad_tensors: + paddle.distributed.recv(d, self.next_stage_id) + + def _step(self, lr_kwargs=None): + self._modifying_grad = True + self.optimizer.step() + self.optimizer.clear_gradients() + self._modifying_grad = False + + def _clear_grads(self, inputs): + if isinstance(inputs, paddle.Tensor): + if inputs.grad is not None: + inputs.clear_gradient() + else: + for d in inputs: + if d.grad is not None: + 
d.clear_gradient() + + def _allocate_zeros(self, shape, dtype): + return paddle.zeros(shape, dtype) + + def _allocate_buffer(self, shape, dtype, num_buffers=-1, **kwargs): + buffers = [] + if num_buffers == -1: + num_buffers = self.num_caches + for count in range(num_buffers): + buffers.append(self._allocate_zeros(shape, dtype)) + return buffers + + def _allocate_buffers(self, shapes, dtypes, num_buffers=-1): + buffers = [] + if num_buffers == -1: + num_buffers = self.num_caches + for count in range(num_buffers): + buffer = [] + for shape, dtype in zip(shapes, dtypes): + buffer.append( + self._allocate_zeros( + shape, dtype, requires_grad=requires_grad)) + buffers.append(buffer) + return buffers + + def save_state_dict(self, model_path): + state_dict = self._layers.state_dict() + paddle.save(state_dict, model_path) + + def load_state_dict(self, model_path): + state_dict = paddle.load(self.model_path) + self._layers.set_state_dict(state_dict) + + _COMMAND_MAP = { + utils.Optimize: _step, + #utils.ReduceGrads: _allreduce_grads, + utils.Forward: _forward, + utils.Backward: _backward, + } + + def _pre_forward(self, *inputs, **kwargs): + pass + + def forward(self, *inputs, **kwargs): + raise RuntimeError("Call train_batch for pipeline instead of forward.") + + def _post_forward(self, output): + pass + + def _pre_backward(self, loss): + pass + + def backward_impl(self, loss, parameters): + pass + + def _post_backward(self, loss): + pass diff --git a/python/paddle/distributed/fleet/meta_parallel/mp_utils/__init__.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/__init__.py similarity index 93% rename from python/paddle/distributed/fleet/meta_parallel/mp_utils/__init__.py rename to python/paddle/distributed/fleet/meta_parallel/pp_utils/__init__.py index a7da28700bceb..d39e6760a3865 100644 --- a/python/paddle/distributed/fleet/meta_parallel/mp_utils/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/__init__.py @@ -12,5 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .layers import * -from .random import * +from .utils import * diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py new file mode 100644 index 0000000000000..56eef8d7d21df --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
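Because a receiving stage does not know the activation shape in advance, `_send_meta`/`_recv_meta` exchange a small header (type flag, ndims, shape) before the payload is sent. Below is a hedged sketch of the single-tensor case, assuming the communication group is already initialized and `peer` is a valid stage rank; the tuple case in the code above simply repeats the same header once per element.

```python
import paddle
import paddle.distributed as dist

def send_tensor_with_meta(tensor, peer):
    # header first: number of dims, then the shape, then the payload
    dist.send(paddle.to_tensor([len(tensor.shape)]), peer)
    dist.send(paddle.to_tensor(tensor.shape), peer)
    dist.send(tensor, peer)

def recv_tensor_with_meta(peer, dtype="float32"):
    ndims = paddle.to_tensor([0])
    dist.recv(ndims, peer)
    shape = paddle.to_tensor([0] * int(ndims.numpy()[0]))
    dist.recv(shape, peer)
    # allocate a buffer of the announced shape, then receive the payload
    buf = paddle.zeros(shape.numpy().tolist(), dtype=dtype)
    dist.recv(buf, peer)
    return buf
```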
+ +import abc +import paddle +from ...utils import hybrid_parallel_util as hp_util + +__all__ = ['get_tensor_bytes', ] + + +def get_tensor_bytes(tensor): + """Get the bytes a tensor occupied.""" + elem_size = None + if tensor.dtype == paddle.float32: + elem_size = 4 + elif tensor.dtype == paddle.float64: + elem_size = 8 + elif tensor.dtype == paddle.int64: + elem_size = 8 + elif tensor.dtype == paddle.int32: + elem_size = 4 + elif tensor.dtype == paddle.float16: + elem_size = 2 + elif tensor.dtype == paddle.int8: + elem_size = 1 + else: + raise ValueError("unknown data type: {}".format(tensor.dtype)) + return tensor.numel() * elem_size + + +class Generator(): + def __init__(self, micro_batches, stages, stage_id): + __metaclass__ = abc.ABCMeta + + self.micro_batches = micro_batches + self.stages = stages + self.stage_id = stage_id + self.prev_stage = self.stage_id - 1 + self.next_stage = self.stage_id + 1 + assert self.micro_batches >= self.stages, ( + "micro_batches {} " + "must be greater than or equal to {}".format(self.micro_batches, + self.stages)) + + @abc.abstractmethod + def generate(self): + pass + + def __iter__(self): + self.iter = None + return self + + def __next__(self): + if self.iter is None: + self.iter = self.generate() + return next(self.iter) + + +class TrainGenerator(Generator): + def generate(self): + startup_steps = self.stages - self.stage_id - 1 + cmds = [] + forward_steps = 0 + backward_steps = 0 + while (forward_steps < startup_steps): + cmds.append(Forward) + forward_steps += 1 + while (forward_steps < self.micro_batches): + cmds.append(Forward) + forward_steps += 1 + cmds.append(Backward) + backward_steps += 1 + while (backward_steps < self.micro_batches): + cmds.append(Backward) + backward_steps += 1 + cmds.append(Optimize) + yield cmds + + +class Command: + def __init__(self, **kwargs): + self.name = self.__class__.__name__ + self.kwargs = kwargs + for key, val in kwargs.items(): + setattr(self, key, val) + + def __repr__(self): + return hp_util.call_to_str(self.name, **self.kwargs) + + +class Optimize(Command): + pass + + +class Forward(Command): + pass + + +class Backward(Command): + pass diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index 774e8db0df52c..0a47750ead7ec 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -14,3 +14,4 @@ from .fs import LocalFS, HDFSClient from .ps_util import DistributedInfer +from .recompute import recompute diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index a866d5be64891..de2d3f45ba033 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -19,8 +19,9 @@ from paddle import framework import paddle from paddle.fluid import core -from paddle.fluid.dygraph.parallel import _split_tensors, sync_params_buffers, construct_groups +from paddle.fluid.dygraph.parallel import _split_tensors, sync_params_buffers, build_groups from collections import OrderedDict +from .log_util import logger def _apply_collective_grads(parameters, comm_group): @@ -37,7 +38,7 @@ def _apply_collective_grads(parameters, comm_group): assert g_var not in grad_var_set grad_var_set.add(g_var) - coalesced_grads_and_vars = construct_groups(grad_vars, 128 * 1024 * 1024) + coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) for 
coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks @@ -47,32 +48,45 @@ def _apply_collective_grads(parameters, comm_group): _split_tensors(coalesced_grads_and_vars) -def broadcast_input_data(hcg, *inputs, **kwargs): +def _broadcast_data_help(data, shape, dtype, hcg): model_parallel_group = hcg.get_model_parallel_group() src_rank = hcg.get_model_parallel_group_src_rank() + mp_rank = hcg.get_model_parallel_rank() + + shape_gpu = paddle.to_tensor(shape, dtype="int32") + paddle.distributed.broadcast( + shape_gpu, + src=src_rank, + group=model_parallel_group, + use_calc_stream=True) + + if mp_rank != 0: + input_data = paddle.zeros(shape_gpu, dtype=dtype) + else: + input_data = data + + paddle.distributed.broadcast( + input_data, + src=src_rank, + group=model_parallel_group, + use_calc_stream=True) - for input_ in inputs: - if isinstance(input_, core.VarBase): + +def broadcast_input_data(hcg, *inputs, **kwargs): + for v in inputs: + if isinstance(v, core.VarBase): with framework.no_grad(): - paddle.distributed.broadcast( - input_, - src=src_rank, - group=model_parallel_group, - use_calc_stream=True) + _broadcast_data_help(v, v.shape, v.dtype, hcg) else: - print("it doesn't support data type {}".format(type(input_))) + logger.error("it doesn't support data type {}".format(type(v))) for k, v in kwargs.items(): if isinstance(v, core.VarBase): with framework.no_grad(): - paddle.distributed.broadcast( - v, - src=src_rank, - group=model_parallel_group, - use_calc_stream=True) + _broadcast_data_help(v, v.shape, v.dtype, hcg) kwargs[k] = v else: - print("it doesn't support data type {}".format(type(v))) + logger.error("it doesn't support data type {}".format(type(v))) return inputs, kwargs @@ -92,5 +106,6 @@ def broadcast_dp_parameters(model, hcg): def fused_allreduce_gradients(parameter_list, hcg): data_parallel_group = hcg.get_data_parallel_group() + logger.debug("dp start fuse allreduce gradients") with framework.no_grad(): _apply_collective_grads(parameter_list, data_parallel_group) diff --git a/python/paddle/distributed/fleet/utils/log_util.py b/python/paddle/distributed/fleet/utils/log_util.py new file mode 100644 index 0000000000000..12c0bf699c1e6 --- /dev/null +++ b/python/paddle/distributed/fleet/utils/log_util.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
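`broadcast_input_data` now broadcasts the tensor shape before the payload, so model-parallel ranks other than the source can allocate a matching buffer first. A hedged sketch of that two-step pattern follows, with a hypothetical helper name and an assumed, already-initialized model-parallel group.

```python
import paddle
import paddle.distributed as dist

def broadcast_tensor(local_tensor, src_rank, mp_rank, group):
    """Two-step broadcast mirroring _broadcast_data_help above:
    agree on the shape first, then move the payload."""
    shape = paddle.to_tensor(local_tensor.shape, dtype="int32")
    dist.broadcast(shape, src=src_rank, group=group)

    if mp_rank != 0:
        # non-source ranks allocate a buffer of the agreed shape
        buf = paddle.zeros(shape, dtype=local_tensor.dtype)
    else:
        buf = local_tensor
    dist.broadcast(buf, src=src_rank, group=group)
    return buf
```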
+ +import logging +import sys + + +class LoggerFactory: + @staticmethod + def build_logger(name=None, level=logging.INFO): + assert name is not None, "name for logger should not be None" + + formatter = logging.Formatter( + "%(asctime)s-%(levelname)s: " + "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") + + _logger = logging.getLogger(name) + _logger.setLevel(level) + _logger.propagate = False + handler = logging.StreamHandler(stream=sys.stderr) + handler.setFormatter(formatter) + handler.setLevel(level) + _logger.addHandler(handler) + return _logger + + +logger = LoggerFactory.build_logger(name="HybridParallel", level=logging.INFO) + + +def layer_to_str(base, *args, **kwargs): + name = base + "(" + if args: + name += ", ".join(str(arg) for arg in args) + if kwargs: + name += ", " + if kwargs: + name += ", ".join("{}={}".format(key, str(value)) + for key, value in kwargs.items()) + name += ")" + return name diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py new file mode 100644 index 0000000000000..0dc305ec77d51 --- /dev/null +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -0,0 +1,177 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.fluid import core +from paddle.autograd import PyLayer +from paddle.fluid import framework +import contextlib + +import logging +logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') + + +def detach_variable(inputs): + out = [] + for inp in inputs: + if not isinstance(inp, core.VarBase): + out.append(inp) + continue + + x = inp.detach() + x.stop_gradient = inp.stop_gradient + out.append(x) + return tuple(out) + + +def check_recompute_necessary(inputs): + if not any(input_.stop_gradient == False for input_ in inputs + if isinstance(input_, paddle.Tensor)): + logging.warn( + "[Recompute]: None of the inputs to current recompute block need grad, " + "therefore there is NO need to recompute this block in backward !") + + +@contextlib.contextmanager +def swith_rng_state(rng_state): + orig_cuda_rng_state = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(rng_state) + try: + yield + finally: + paddle.set_cuda_rng_state(orig_cuda_rng_state) + + +class RecomputeFunction(PyLayer): + @staticmethod + def forward(ctx, run_function, preserve_rng_state, *args): + check_recompute_necessary(args) + + # store for recomputing + ctx.run_function = run_function + ctx.preserve_rng_state = preserve_rng_state + + # NOTE the number of outputs of backward() should be equal to the number of tensors in forward()'s input + # the order of tensors in backward()'s output should be the same as tensors in forward()'s input + # None tensor inputs will be filtered in backward inputs. 
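The pipeline schedule produced by `TrainGenerator` in pp_utils/utils.py above is easiest to see written out: warm-up forwards for the earlier micro-batches of later stages, alternating forward/backward for the remaining micro-batches, the leftover backwards, then one optimize step. The sketch below restates `generate()` with plain strings instead of the Command classes (illustration only).

```python
def train_schedule(micro_batches, stages, stage_id):
    """Illustrative restatement of TrainGenerator.generate() above,
    emitting command names instead of Command classes."""
    startup = stages - stage_id - 1          # warm-up forwards for this stage
    cmds = ["Forward"] * startup
    for _ in range(startup, micro_batches):  # steady phase: one F then one B
        cmds += ["Forward", "Backward"]
    cmds += ["Backward"] * startup           # drain the remaining backwards
    cmds.append("Optimize")                  # single optimizer step at the end
    return cmds

# stage 1 of 4 with 4 micro-batches:
# ['Forward', 'Forward', 'Forward', 'Backward', 'Forward', 'Backward',
#  'Backward', 'Backward', 'Optimize']
print(train_schedule(4, 4, 1))
```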
+ + # save input for backward + ctx.inputs = [] + ctx.tensor_indices = [] + tensor_inputs = [] + for i, arg in enumerate(args): + if paddle.is_tensor(arg): + tensor_inputs.append(arg) + ctx.tensor_indices.append(i) + ctx.inputs.append(None) + else: + ctx.inputs.append(arg) + ctx.save_for_backward(*tensor_inputs) + + # NOTE recompute with restore RNG only support one senario where one process for one cuda gpu. + # one process with multiple gpu and mix-gpu-cpu senarios are not support + if ctx.preserve_rng_state: + cur_device = paddle.get_device() + if 'gpu:' not in cur_device: + raise RuntimeError( + "Recompute with RNG perserve is not support current device: {}.". + format(cur_device)) + ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() + + # TODO support AMP + + with paddle.no_grad(): + outputs = run_function(*args) + + return outputs + + @staticmethod + def backward(ctx, *args): + with paddle.fluid.dygraph.guard(): + # TODO need to check the recompute calling is vaild or not + + # Restore inputs + inputs = list(ctx.inputs) + tensor_indices = ctx.tensor_indices + tensors = ctx.saved_tensor() + for i, idx in enumerate(tensor_indices): + inputs[idx] = tensors[i] + + # paddle.enable_grad() + tracer = framework._dygraph_tracer() + tracer._has_grad = True + + # TODO support AMP + + if ctx.preserve_rng_state: + with swith_rng_state(ctx.fw_cuda_rng_state): + detached_inputs = detach_variable(tuple(inputs)) + outputs = ctx.run_function(*detached_inputs) + else: + detached_inputs = detach_variable(tuple(inputs)) + outputs = ctx.run_function(*detached_inputs) + + if isinstance(outputs, core.VarBase): + outputs = (outputs, ) + assert len(outputs) == len(args) + + # run backward() with only tensor that requires grad + forward_outputs_with_grad = [] + backward_inputs = list(args) + for i in range(len(outputs)): + if isinstance(outputs[i], + core.VarBase) and not outputs[i].stop_gradient: + forward_outputs_with_grad.append(outputs[i]) + if len(forward_outputs_with_grad) == 0: + raise RuntimeError( + "none of output has requires_grad=True, this recompute() is not necessary" + ) + + assert len(backward_inputs) == len( + forward_outputs_with_grad + ), "number of forward outputs is [{}], but the backward got [{}] inputs".format( + len(forward_outputs_with_grad), len(backward_inputs)) + + # actually backward + paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) + + grads = list(inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, core.VarBase)) + + return grads + + +def recompute(function, *args, **kwargs): + """ + recompute intermediate activations to save then memory. + + Args: + function: layer of sequence of layers that describes part of forward pass of the model whose + intermediate activations will be released to save memory in forward stage and will be recomputed + in backward stage for gradient calculation. + preserve_rng_state(bool, optional): if preserve the RNG state of forward and restore it in backward. 
+ args: inputs to the function + + Returns: + Output of function on args + """ + # Hack to mix *args with **kwargs in a python 2.7-compliant way + preserve = kwargs.pop('preserve_rng_state', True) + if kwargs: + raise ValueError("Unexpected keyword arguments: " + ",".join( + arg for arg in kwargs)) + + return RecomputeFunction.apply(function, preserve, *args) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 439b5c64615c0..3b73034dfde2e 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -72,7 +72,6 @@ from .core import LoDTensor, LoDTensorArray, Scope, _Scope from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace from .incubate import fleet -from .incubate import data_generator from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index df41e649ca8cb..30981f531289a 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -22,11 +22,7 @@ from .op_frequence import * from . import quantize from .quantize import * -from . import reader -from .reader import * from . import slim -from . import utils -from .utils import * from . import extend_optimizer from .extend_optimizer import * from . import model_stat @@ -42,8 +38,6 @@ __all__ += memory_usage_calc.__all__ __all__ += op_frequence.__all__ __all__ += quantize.__all__ -__all__ += reader.__all__ -__all__ += utils.__all__ __all__ += extend_optimizer.__all__ __all__ += ['mixed_precision'] __all__ += layers.__all__ diff --git a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py index 3bfc078971d7a..588eb2a29f555 100644 --- a/python/paddle/fluid/contrib/mixed_precision/amp_nn.py +++ b/python/paddle/fluid/contrib/mixed_precision/amp_nn.py @@ -20,7 +20,7 @@ __all__ = ['check_finite_and_unscale', 'update_loss_scaling'] -def check_finite_and_unscale(x, scale, name=None): +def check_finite_and_unscale(x, scale, name=None, float_status=None): """ Check if input X contains all finite data, if yes, scale it by input Scale. @@ -30,9 +30,11 @@ def check_finite_and_unscale(x, scale, name=None): FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of Out should not be used, and its data may not be deterministic. Otherwise, FoundInfinite will be 0 (False). + Args: x(list|tuple): The input tensors of check_finite_and_unscale operator. scale: The scale of check_finite_and_unscale operator. + float_status(Tensor): (Only used on NPU) The float status to check overflow. 
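The new `paddle.distributed.fleet.utils.recompute` helper trades compute for memory: activations inside the wrapped block are discarded in forward and recomputed during backward. Below is a hedged usage sketch with a made-up model and sizes; note that the default `preserve_rng_state=True` path requires a GPU device, per the check in `RecomputeFunction.forward`.

```python
# Usage sketch only: model, sizes and names are invented for illustration,
# and a CUDA device is assumed because RNG-state preservation needs it.
import paddle
from paddle.distributed.fleet.utils import recompute

class Net(paddle.nn.Layer):
    def __init__(self):
        super(Net, self).__init__()
        self.block1 = paddle.nn.Sequential(
            paddle.nn.Linear(64, 64), paddle.nn.ReLU())
        self.block2 = paddle.nn.Linear(64, 1)

    def forward(self, x):
        # activations inside block1 are dropped in forward and
        # recomputed during the backward pass
        h = recompute(self.block1, x)
        return self.block2(h)

net = Net()
x = paddle.randn([8, 64])
x.stop_gradient = False   # recompute warns if no input needs grad
loss = net(x).mean()
loss.backward()
```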
""" check_type(x, 'x', (tuple, list), 'check_finite_and_unscale') for e in x: @@ -43,6 +45,11 @@ def check_finite_and_unscale(x, scale, name=None): found_inf = helper.create_variable_for_type_inference(dtype='bool') inputs = {'X': x, 'Scale': scale} + if core.is_compiled_with_npu(): + check_variable_and_dtype(float_status, "float_status", + ['float16', 'float32'], + 'check_finite_and_unscale') + inputs['FloatStatus'] = float_status outputs = {'Out': x, 'FoundInfinite': found_inf} helper.append_op( type='check_finite_and_unscale', inputs=inputs, outputs=outputs) diff --git a/python/paddle/fluid/contrib/mixed_precision/decorator.py b/python/paddle/fluid/contrib/mixed_precision/decorator.py index 724f707c2e1f0..3cb9fe75559b1 100644 --- a/python/paddle/fluid/contrib/mixed_precision/decorator.py +++ b/python/paddle/fluid/contrib/mixed_precision/decorator.py @@ -29,6 +29,7 @@ from .amp_nn import update_loss_scaling import types import warnings +import paddle __all__ = ["decorate"] @@ -98,7 +99,7 @@ def _set_distributed(self, flag): def get_loss_scaling(self): """Return the real-time loss scaling factor. """ - assert self._loss_scaling is not None, 'Call minimize() before calling get_loss_scaling()' + assert self._loss_scaling is not None, 'Please call minimize() before calling get_loss_scaling().' return self._loss_scaling def get_scaled_loss(self): @@ -165,6 +166,17 @@ def backward(self, train_program = loss.block.program self._train_program = train_program + # NOTE(zhiqiu): _float_status is only used for NPU. + if core.is_compiled_with_npu(): + float_status = paddle.static.data( + name="float_status", shape=[8], dtype='float32') + self._train_program.global_block().append_op( + type="alloc_float_status", + outputs={"FloatStatus": float_status}, ) + self._float_status = float_status + else: + self._float_status = None + with program_guard(self._train_program, startup_program): self._init_amp_var() @@ -294,7 +306,10 @@ def apply_gradients(self, params_grads): for p, g in params_grads: with self._train_program._optimized_guard([p, g]): _, found_inf = check_finite_and_unscale( - [g, ], self._loss_scaling, name="find_infinite_scale") + [g, ], + self._loss_scaling, + name="find_infinite_scale", + float_status=self._float_status) found_infs.append(found_inf) elif self._use_pure_fp16: if fp32_grads: @@ -302,19 +317,24 @@ def apply_gradients(self, params_grads): _, fp32_found_inf = check_finite_and_unscale( fp32_grads, self._loss_scaling, - name="find_infinite_scale_fp32") + name="find_infinite_scale_fp32", + float_status=self._float_status) found_infs.append(fp32_found_inf) if fp16_grads: with self._train_program._optimized_guard(fp16_grads): _, fp16_found_inf = check_finite_and_unscale( fp16_grads, self._loss_scaling, - name="find_infinite_scale_fp16") + name="find_infinite_scale_fp16", + float_status=self._float_status) found_infs.append(fp16_found_inf) else: with self._train_program._optimized_guard(grads): _, found_inf = check_finite_and_unscale( - grads, self._loss_scaling, name="find_infinite_scale") + grads, + self._loss_scaling, + name="find_infinite_scale", + float_status=self._float_status) if self._use_dynamic_loss_scaling: if self._is_distributed or self._use_pure_fp16: @@ -394,6 +414,7 @@ def minimize(self, The scaled loss by scaling factor, the list of optimize ops, and a list of scaled parameters and gradients. 
""" + opt_dict = self._optimizer.__class__.__dict__ if 'minimize' in opt_dict and isinstance(opt_dict['minimize'], types.FunctionType): diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index cf35963fe2b26..65b62e7e5ab55 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -103,7 +103,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): if in_name not in {'X', 'Z'}: continue for in_var_name in op.input(in_name): - in_var = block.var(in_var_name) + in_var = block._find_var_recursive(in_var_name) if in_var.type not in _valid_types or in_var.dtype == dest_dtype: continue if in_var.dtype == src_dtype: diff --git a/python/paddle/fluid/contrib/reader/README.md b/python/paddle/fluid/contrib/reader/README.md deleted file mode 100644 index f043a17493ec2..0000000000000 --- a/python/paddle/fluid/contrib/reader/README.md +++ /dev/null @@ -1,25 +0,0 @@ -## CTR READER - -An multi-thread cpp reader that has the same interface with py_reader. It -uses cpp multi-thread to read file and is much more faster then the Python read -thread in py_reader. - -Currently, it support two types of file: - - gzip - - plain text file - -and two types of data format: - - cvs data format is : - * label dense_fea,dense_fea sparse_fea,sparse_fea - - the svm data format is : - * label slot1:fea_sign slot2:fea_sign slot1:fea_sign - -## Distributed reader - -The distributed reader is mainly used by multi-process tasks, and the input must be a batch reader. - -Cons: - - It can be operated conveniently so that different processes can read different data. - -Pros: - - If batch_reader produces training data, and batch_reader loads or preprocesses data for a long time, this data reading method may be slower. diff --git a/python/paddle/fluid/contrib/reader/distributed_reader.py b/python/paddle/fluid/contrib/reader/distributed_reader.py deleted file mode 100644 index ecee769218f54..0000000000000 --- a/python/paddle/fluid/contrib/reader/distributed_reader.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import os - -__all__ = ["distributed_batch_reader"] - - -def distributed_batch_reader(batch_reader): - """ - Create a reader for multi-process training. The input must be a batch reader. - - Args: - batch_reader (callable): The input reader should be a batch reader. - - Examples: - - .. 
code-block:: python - import paddle - import paddle.fluid as fluid - - train_reader = paddle.batch(paddle.dataset.mnist.train(), - batch_size=32,drop_last=True) - train_reader = fluid.contrib.reader.distributed_batch_reader( - train_reader) - - """ - trainers_num = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) - trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) - assert trainer_id < trainers_num - - def decorate_for_multi_process(): - if trainers_num > 1: - print("start data reader (trainers_num: {}, trainer_id: {})".format( - trainers_num, trainer_id)) - - train_data, idx = None, 1 - for batch_id, data in enumerate(batch_reader()): - if trainers_num > 1: - if idx < trainers_num: - if idx == trainer_id + 1: - train_data = data - idx += 1 - else: - if idx == trainer_id + 1: - train_data = data - assert train_data is not None, "train data should not be None." - yield train_data - train_data, idx = None, 1 - else: - yield data - - return decorate_for_multi_process diff --git a/python/paddle/fluid/contrib/tests/test_distributed_reader.py b/python/paddle/fluid/contrib/tests/test_distributed_reader.py deleted file mode 100644 index b964168eb3a2f..0000000000000 --- a/python/paddle/fluid/contrib/tests/test_distributed_reader.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import unittest -import numpy as np -import paddle.fluid as fluid -import os - - -def data_generator(): - data = [0, 1, 2, 3] - for val in data: - yield val - - -class TestDistributedReader(unittest.TestCase): - def test_distributed_reader(self): - trainer_num = 4 - os.environ['PADDLE_TRAINER_ID'] = str(1) - os.environ['PADDLE_TRAINERS_NUM'] = str(trainer_num) - - reader = fluid.contrib.reader.distributed_batch_reader(data_generator) - data = next(reader()) - assert data == 1 - - #Note: windows python3 don't have unsetenv - del os.environ['PADDLE_TRAINER_ID'] - del os.environ['PADDLE_TRAINERS_NUM'] - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py deleted file mode 100644 index 9572552f0f2be..0000000000000 --- a/python/paddle/fluid/contrib/utils/hdfs_utils.py +++ /dev/null @@ -1,603 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""hdfs_utils.py will move to fluid/incubate/fleet/utils/hdfs.py""" - -import os -import sys -import subprocess -import multiprocessing -from datetime import datetime - -import re -import copy -import errno - -import logging -from paddle.fluid.log_helper import get_logger - -__all__ = ["HDFSClient", "multi_download", "multi_upload"] - -_logger = get_logger( - __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') - - -class HDFSClient(object): - r""" - A tool of HDFS - - Args: - hadoop_home (string): hadoop_home - configs (dict): hadoop config, it is a dict, please contain \ - key "fs.default.name" and "hadoop.job.ugi" - Can be a float value - Examples: - hadoop_home = "/home/client/hadoop-client/hadoop/" - - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } - - client = HDFSClient(hadoop_home, configs) - - client.ls("/user/com/train-25") - files = client.lsr("/user/com/train-25/models") - """ - - def __init__(self, hadoop_home, configs): - self.pre_commands = [] - hadoop_bin = '%s/bin/hadoop' % hadoop_home - self.pre_commands.append(hadoop_bin) - dfs = 'fs' - self.pre_commands.append(dfs) - - for k, v in configs.items(): - config_command = '-D%s=%s' % (k, v) - self.pre_commands.append(config_command) - - def __run_hdfs_cmd(self, commands, retry_times=5): - whole_commands = copy.deepcopy(self.pre_commands) - whole_commands.extend(commands) - - print('Running system command: {0}'.format(' '.join(whole_commands))) - - ret_code = 0 - ret_out = None - ret_err = None - whole_commands = " ".join(whole_commands) - for x in range(retry_times + 1): - proc = subprocess.Popen( - whole_commands, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - (output, errors) = proc.communicate() - ret_code, ret_out, ret_err = proc.returncode, output, errors - if ret_code: - _logger.warn( - 'Times: %d, Error running command: %s. Return code: %d, Error: %s' - % (x, ' '.join(whole_commands), proc.returncode, errors)) - else: - break - return ret_code, ret_out, ret_err - - def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): - """ - upload the local file to hdfs - - Args: - hdfs_path(str): the hdfs file path - local_path(str): the local file path - overwrite(bool|None): will overwrite the file on HDFS or not - retry_times(int|5): retry times - - Returns: - True or False - """ - assert hdfs_path is not None - assert local_path is not None and os.path.exists(local_path) - - if os.path.isdir(local_path): - _logger.warn( - "The Local path: {} is dir and I will support it later, return". - format(local_path)) - return False - - base = os.path.basename(local_path) - if not self.is_exist(hdfs_path): - self.makedirs(hdfs_path) - else: - if self.is_exist(os.path.join(hdfs_path, base)): - if overwrite: - _logger.error( - "The HDFS path: {} is exist and overwrite is True, delete it". - format(hdfs_path)) - self.delete(hdfs_path) - else: - _logger.error( - "The HDFS path: {} is exist and overwrite is False, return". - format(hdfs_path)) - return False - - put_commands = ["-put", local_path, hdfs_path] - returncode, output, errors = self.__run_hdfs_cmd(put_commands, - retry_times) - if returncode: - _logger.error("Put local path: {} to HDFS path: {} failed".format( - local_path, hdfs_path)) - return False - else: - _logger.info("Put local path: {} to HDFS path: {} successfully". 
- format(local_path, hdfs_path)) - return True - - def download(self, hdfs_path, local_path, overwrite=False, unzip=False): - """ - download file from HDFS - - Args: - hdfs_path(str): the hdfs file path - local_path(str): the local file path - overwrite(bool|None): will overwrite the file on HDFS or not - unzip(bool|False): if the download file is compressed by zip, unzip it or not. - - Returns: - True or False - """ - _logger.info('Downloading %r to %r.', hdfs_path, local_path) - _logger.info('Download of %s to %r complete.', hdfs_path, local_path) - - if not self.is_exist(hdfs_path): - print("HDFS path: {} do not exist".format(hdfs_path)) - return False - if self.is_dir(hdfs_path): - _logger.error( - "The HDFS path: {} is dir and I will support it later, return". - format(hdfs_path)) - - if os.path.exists(local_path): - base = os.path.basename(hdfs_path) - local_file = os.path.join(local_path, base) - if os.path.exists(local_file): - if overwrite: - os.remove(local_file) - else: - _logger.error( - "The Local path: {} is exist and overwrite is False, return". - format(local_file)) - return False - - self.make_local_dirs(local_path) - - download_commands = ["-get", hdfs_path, local_path] - returncode, output, errors = self.__run_hdfs_cmd(download_commands) - if returncode: - _logger.error("Get local path: {} from HDFS path: {} failed".format( - local_path, hdfs_path)) - return False - else: - _logger.info("Get local path: {} from HDFS path: {} successfully". - format(local_path, hdfs_path)) - return True - - def is_exist(self, hdfs_path=None): - """ - whether the remote HDFS path exists - - Args: - hdfs_path(str): the hdfs file path - - Returns: - True or False - """ - exist_cmd = ['-test', '-e', hdfs_path] - returncode, output, errors = self.__run_hdfs_cmd( - exist_cmd, retry_times=1) - - if returncode: - _logger.error("HDFS is_exist HDFS path: {} failed".format( - hdfs_path)) - return False - else: - _logger.info("HDFS is_exist HDFS path: {} successfully".format( - hdfs_path)) - return True - - def is_dir(self, hdfs_path=None): - """ - whether the remote HDFS path is directory - - Args: - hdfs_path(str): the hdfs file path - - Returns: - True or False - """ - - if not self.is_exist(hdfs_path): - return False - - dir_cmd = ['-test', '-d', hdfs_path] - returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1) - - if returncode: - _logger.error("HDFS path: {} failed is not a directory".format( - hdfs_path)) - return False - else: - _logger.info("HDFS path: {} successfully is a directory".format( - hdfs_path)) - return True - - def delete(self, hdfs_path): - """ - Remove a file or directory from HDFS. - - whether the remote HDFS path exists - - Args: - hdfs_path: HDFS path. - - Returns: - True or False - This function returns `True` if the deletion was successful and `False` if - no file or directory previously existed at `hdfs_path`. 
- """ - _logger.info('Deleting %r.', hdfs_path) - - if not self.is_exist(hdfs_path): - _logger.warn("HDFS path: {} do not exist".format(hdfs_path)) - return True - - if self.is_dir(hdfs_path): - del_cmd = ['-rmr', hdfs_path] - else: - del_cmd = ['-rm', hdfs_path] - - returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0) - - if returncode: - _logger.error("HDFS path: {} delete files failure".format( - hdfs_path)) - return False - else: - _logger.info("HDFS path: {} delete files successfully".format( - hdfs_path)) - return True - - def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): - """ - Move a file or folder on HDFS. - - Args: - hdfs_path(str): HDFS path. - overwrite(bool|False): If the path already exists and overwrite is False, will return False. - - Returns: - True or False - """ - assert hdfs_src_path is not None - assert hdfs_dst_path is not None - - if not self.is_exist(hdfs_src_path): - _logger.info("HDFS path do not exist: {}".format(hdfs_src_path)) - if self.is_exist(hdfs_dst_path) and not overwrite: - _logger.error("HDFS path is exist: {} and overwrite=False".format( - hdfs_dst_path)) - - rename_command = ['-mv', hdfs_src_path, hdfs_dst_path] - returncode, output, errors = self.__run_hdfs_cmd( - rename_command, retry_times=1) - - if returncode: - _logger.error("HDFS rename path: {} to {} failed".format( - hdfs_src_path, hdfs_dst_path)) - return False - else: - _logger.info("HDFS rename path: {} to {} successfully".format( - hdfs_src_path, hdfs_dst_path)) - return True - - @staticmethod - def make_local_dirs(local_path): - """ - create a directory local, is same to mkdir - Args: - local_path: local path that wants to create a directory. - """ - try: - os.makedirs(local_path) - except OSError as e: - if e.errno != errno.EEXIST: - raise - - def makedirs(self, hdfs_path): - """ - Create a remote directory, recursively if necessary. - - Args: - hdfs_path(str): Remote path. Intermediate directories will be created appropriately. - - Returns: - True or False - """ - _logger.info('Creating directories to %r.', hdfs_path) - assert hdfs_path is not None - - if self.is_exist(hdfs_path): - _logger.error("HDFS path is exist: {}".format(hdfs_path)) - return - - mkdirs_commands = ['-mkdir', hdfs_path] - returncode, output, errors = self.__run_hdfs_cmd( - mkdirs_commands, retry_times=1) - - if returncode: - _logger.error("HDFS mkdir path: {} failed".format(hdfs_path)) - return False - else: - _logger.error("HDFS mkdir path: {} successfully".format(hdfs_path)) - return True - - def ls(self, hdfs_path): - """ - ls directory contents about HDFS hdfs_path - - Args: - hdfs_path(str): Remote HDFS path will be ls. - - Returns: - List: a contents list about hdfs_path. - """ - assert hdfs_path is not None - - if not self.is_exist(hdfs_path): - return [] - - ls_commands = ['-ls', hdfs_path] - returncode, output, errors = self.__run_hdfs_cmd( - ls_commands, retry_times=1) - - if returncode: - _logger.error("HDFS list path: {} failed".format(hdfs_path)) - return [] - else: - _logger.info("HDFS list path: {} successfully".format(hdfs_path)) - - ret_lines = [] - regex = re.compile(r'\s+') - out_lines = output.strip().split("\n") - for line in out_lines: - re_line = regex.split(line) - if len(re_line) == 8: - ret_lines.append(re_line[7]) - return ret_lines - - def lsr(self, hdfs_path, only_file=True, sort=True): - """ - list directory contents about HDFS hdfs_path recursively - - Args: - hdfs_path(str): Remote HDFS path. - only_file(bool|True): will discard folders. 
- sort(bool|True): will be sorted by create time. - - Returns: - List: a contents list about hdfs_path. - """ - - def sort_by_time(v1, v2): - v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M') - v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M') - return v1_time > v2_time - - assert hdfs_path is not None - - if not self.is_exist(hdfs_path): - return [] - - ls_commands = ['-lsr', hdfs_path] - returncode, output, errors = self.__run_hdfs_cmd( - ls_commands, retry_times=1) - - if returncode: - _logger.error("HDFS list all files: {} failed".format(hdfs_path)) - return [] - else: - _logger.info("HDFS list all files: {} successfully".format( - hdfs_path)) - lines = [] - regex = re.compile(r'\s+') - out_lines = output.strip().split("\n") - for line in out_lines: - re_line = regex.split(line) - if len(re_line) == 8: - if only_file and re_line[0][0] == "d": - continue - else: - lines.append( - (re_line[7], re_line[5] + " " + re_line[6])) - if sort: - sorted(lines, cmp=sort_by_time) - ret_lines = [ret[0] for ret in lines] - return ret_lines - - -def multi_download(client, - hdfs_path, - local_path, - trainer_id, - trainers, - multi_processes=5): - """ - Download files from HDFS using multi process. - - Args: - client(HDFSClient): instance of HDFSClient - hdfs_path(str): path on hdfs - local_path(str): path on local - trainer_id(int): current trainer id - trainers(int): all trainers number - multi_processes(int|5): the download data process at the same time, default=5 - - Returns: - List: - Download files in local folder. - """ - - def __subprocess_download(datas): - for data in datas: - re_path = os.path.relpath(os.path.dirname(data), hdfs_path) - if re_path == os.curdir: - sub_local_re_path = local_path - else: - sub_local_re_path = os.path.join(local_path, re_path) - client.download(data, sub_local_re_path) - - assert isinstance(client, HDFSClient) - - client.make_local_dirs(local_path) - _logger.info("Make local dir {} successfully".format(local_path)) - - all_need_download = client.lsr(hdfs_path, sort=True) - need_download = all_need_download[trainer_id::trainers] - _logger.info("Get {} files From all {} files need to be download from {}". - format(len(need_download), len(all_need_download), hdfs_path)) - - _logger.info("Start {} multi process to download datas".format( - multi_processes)) - procs = [] - for i in range(multi_processes): - process_datas = need_download[i::multi_processes] - p = multiprocessing.Process( - target=__subprocess_download, args=(process_datas, )) - procs.append(p) - p.start() - - # complete the processes - for proc in procs: - proc.join() - - _logger.info("Finish {} multi process to download datas".format( - multi_processes)) - - local_downloads = [] - for data in need_download: - data_name = os.path.basename(data) - re_path = os.path.relpath(os.path.dirname(data), hdfs_path) - if re_path == os.curdir: - local_re_path = os.path.join(local_path, data_name) - else: - local_re_path = os.path.join(local_path, re_path, data_name) - local_downloads.append(local_re_path) - - return local_downloads - - -def getfilelist(path): - rlist = [] - for dir, folder, file in os.walk(path): - for i in file: - t = os.path.join(dir, i) - rlist.append(t) - for r in rlist: - print(r) - - -def multi_upload(client, - hdfs_path, - local_path, - multi_processes=5, - overwrite=False, - sync=True): - """ - Upload files to HDFS using multi process. 
- - Args: - client(HDFSClient): instance of HDFSClient - hdfs_path(str): path on hdfs - local_path(str): path on local - multi_processes(int|5): the upload data process at the same time, default=5 - overwrite(bool|False): will overwrite file on HDFS or not - sync(bool|True): upload files sync or not. - - Returns: - None - """ - - def __subprocess_upload(datas): - for data in datas: - re_path = os.path.relpath(os.path.dirname(data), local_path) - hdfs_re_path = os.path.join(hdfs_path, re_path) - client.upload(hdfs_re_path, data, overwrite, retry_times=5) - - def get_local_files(path): - rlist = [] - - if not os.path.isdir(path): - return rlist - - for dirname, folder, files in os.walk(path): - for i in files: - t = os.path.join(dirname, i) - rlist.append(t) - return rlist - - assert isinstance(client, HDFSClient) - - all_files = get_local_files(local_path) - if not all_files: - _logger.info("there are nothing need to upload, exit") - return - _logger.info("Start {} multi process to upload datas".format( - multi_processes)) - procs = [] - for i in range(multi_processes): - process_datas = all_files[i::multi_processes] - p = multiprocessing.Process( - target=__subprocess_upload, args=(process_datas, )) - procs.append(p) - p.start() - - # complete the processes - for proc in procs: - proc.join() - - _logger.info("Finish {} multi process to upload datas".format( - multi_processes)) - - -if __name__ == "__main__": - hadoop_home = "/home/client/hadoop-client/hadoop/" - - configs = { - "fs.default.name": "hdfs://xxx.hadoop.com:54310", - "hadoop.job.ugi": "hello,hello123" - } - - client = HDFSClient(hadoop_home, configs) - - client.ls("/user/com/train-25") - files = client.lsr("/user/com/train-25/models") - - downloads = multi_download( - client, - "/user/com/train-25/model", - "/home/xx/data1", - 1, - 5, - 100, - multi_processes=5) - - multi_upload(client, "/user/com/train-25/model", "/home/xx/data1") diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py deleted file mode 100644 index 7d30de565e7a4..0000000000000 --- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py +++ /dev/null @@ -1,496 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""lookup_table_utils.py will move to fluid/incubate/fleet/utils/lookup_table.py""" - -from __future__ import print_function - -import os -import time -import logging - -import paddle -from paddle.fluid import core -from paddle.fluid import io -from paddle.fluid import Program -from paddle.fluid.log_helper import get_logger - -__all__ = [ - "load_persistables_for_increment", "load_persistables_for_inference", - "convert_dist_to_sparse_program" -] - -_logger = get_logger( - 'lookup_table_utils', - logging.INFO, - fmt='%(asctime)s-%(levelname)s: %(message)s') - -model_filename = "__model__" -lookup_table_dir = "__lookup_table__" - - -def __insert_lookup_sparse_table_op(main_program, idx, ids, w, out): - main_program.global_block()._insert_op( - index=idx, - type="lookup_sparse_table", - inputs={"Ids": [ids], - "W": [w]}, - outputs={"Out": [out]}, - attrs={ - "is_distributed": False, - "is_sparse": True, - "grad_inplace": False - }) - - -def __get_prefetch_op_tuples(main_program): - # current lookup tables op is split_ids->prefetch->merge_ids - prefetch_op_tuples = None - op_types = [op.type for op in main_program.global_block().ops] - - for i in range(len(op_types)): - if op_types[i] == "prefetch": - if op_types[i - 1] == "split_ids" and op_types[i + - 1] == "merge_ids": - split_ids_op_id = i - 1 - split_ids_inputs = main_program.global_block().ops[i - 1].input( - "Ids") - prefetch_op_inputs = main_program.global_block().ops[i].input( - "X") - prefetch_op_outputs = main_program.global_block().ops[i].output( - "Out") - merge_ids_outputs = main_program.global_block().ops[ - i + 1].output("Out") - - need_delete_vars = [] - need_delete_vars.extend(prefetch_op_inputs) - need_delete_vars.extend(prefetch_op_outputs) - - prefetch_op_tuples = (split_ids_op_id, split_ids_inputs, - merge_ids_outputs, need_delete_vars) - break - return prefetch_op_tuples - - -def convert_dist_to_sparse_program(program): - """ - WARNING: this function will only be used for distributed training with distributed lookup table. - when we train model with distributed lookup table but want to do the local inference, we can use - this function to convert the train program with distributed lookup table to sparse lookup table. - - Args: - program(Program): the program must be the trainer program, which will be get by the distribute transpiler. - Returns: - program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table. 
- """ - if not program._distributed_lookup_table: - _logger.warn( - "There are no distributed lookup tables need to be converted") - return - - # create table param and grad var in pserver program - origin_emb_var = "{}.origin".format(program._distributed_lookup_table) - emb_var = program._distributed_lookup_table - program.global_block()._rename_var(emb_var, origin_emb_var) - origin_param_var = program.global_block().vars[origin_emb_var] - - param_var = program.global_block().create_var( - name=emb_var, - shape=origin_param_var.shape, - dtype=origin_param_var.dtype, - type=core.VarDesc.VarType.SELECTED_ROWS, - persistable=True) - # parameter must be selected rows - param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) - program._sync_with_cpp() - - prefetch_op_tuples = __get_prefetch_op_tuples(program) - - split_ids_id = prefetch_op_tuples[0] - - for idx in range(split_ids_id + 2, split_ids_id - 1, -1): - program.global_block()._remove_op(idx) - program.desc.flush() - - in_out_pairs = zip(prefetch_op_tuples[1], prefetch_op_tuples[2]) - - for in_out_pair in in_out_pairs: - idx = split_ids_id - ids = program.global_block().vars[in_out_pair[0]] - out = program.global_block().vars[in_out_pair[1]] - __insert_lookup_sparse_table_op(program, idx, ids, param_var, out) - program.desc.flush() - return program - - -def load_persistables_for_increment(dirname, executor, program, - lookup_table_var, lookup_table_var_path): - """ - WARNING: this function will only be used for distributed training with distributed lookup table. - for increment training, the pserver will not only load dense variables, - but also load the suitable lookup table var. Because of sliced lookup table - var with HASH, we must load the correct sliced var. - - Args: - dirname(str): The directory path - executor(Executor): The executor to run for loading inference model. - program(Program): The parameter server program, which will run on Pserver. - lookup_table_var: the distributed lookup tables var name. - lookup_table_var_path: the the distributed lookup tables var location. 
- - Returns: - None - """ - - def _load_persistable_vars(executor, dirname, need_load_vars): - load_prog = Program() - load_block = load_prog.global_block() - need_delete_vars = [] - - for param in need_load_vars: - origin_var = param.origin - slice_var = param.slice - is_slice = param.is_slice - offset = param.offset - - if is_slice: - origin = load_block.create_var( - name="{}.load".format(origin_var.name), - type=origin_var.type, - shape=origin_var.shape, - dtype=origin_var.dtype, - persistable=True) - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [origin]}, - attrs={ - 'file_path': os.path.join(dirname, origin_var.name) - }) - - slice = load_block.create_var( - name=slice_var.name, - type=slice_var.type, - shape=slice_var.shape, - dtype=slice_var.dtype, - persistable=True) - - dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) - start = int(offset / dim1_flatten) - end = int(offset / dim1_flatten + slice.shape[0]) - - load_block.append_op( - type="slice", - inputs={'Input': origin}, - outputs={'Out': slice}, - attrs={'axes': [0], - 'starts': [start], - 'ends': [end]}) - - need_delete_vars.append(origin) - else: - origin = load_block.create_var( - name="{}".format(origin_var.name), - type=origin_var.type, - shape=origin_var.shape, - dtype=origin_var.dtype, - persistable=True) - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [origin]}, - attrs={ - 'file_path': os.path.join(dirname, origin_var.name) - }) - - load_block.append_op( - type='delete_var', - inputs={'X': need_delete_vars}, ) - - executor.run(load_prog) - - def __load_lookup_table_vars(executor, main_program, lookup_table_var, - lookup_table_var_path): - emb_var = main_program.global_block().var(lookup_table_var) - - load_program = Program() - load_block = load_program.global_block() - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [emb_var]}, - attrs={'file_path': lookup_table_var_path}) - executor.run(load_program) - - if not os.path.isdir(dirname): - raise ValueError("There is no directory named '%s'", dirname) - - if not os.path.exists(lookup_table_var_path): - raise ValueError("There is no file named '%s'", lookup_table_var_path) - - if not isinstance(program, Program): - raise ValueError("program must be an instance of fluid.Program") - - _logger.info("Start Load Sparse Program With " - "Distributed Lookup Table Vars from {}, time = {}".format( - dirname, time.ctime())) - - need_load_vars = program._parameters_on_pservers.get_distributed_vars_by_ep( - program._ps_endpoint) - _load_persistable_vars(executor, dirname, need_load_vars) - __load_lookup_table_vars(executor, program, lookup_table_var, - lookup_table_var_path) - - _logger.info("Finish Load Sparse Program With " - "Distributed Lookup Table Vars from {}, time = {}".format( - dirname, time.ctime())) - - -def load_persistables_for_inference(dirname, executor, program, - lookup_table_var_name): - """ - WARNING: this function will only be used for inference with distributed lookup table. - Inference with distributed lookup table is a little funky, this function will load distributed - lookup table vars into sparse var, can be used in local inference mode. - - Args: - dirname(str): The directory path - executor(Executor): The executor to run for loading inference model. - program(Program): The parameter server program, which will run on Pserver. - lookup_table_var_name: the distributed lookup tables var name. 
- Returns: - None - """ - - def _load_persistable_vars(executor, dirname, program, lookup_table_vars): - def _is_checkpoint_var(exclude_fluid_vars=None): - """ - the checkpoint will not save or load all the variables. - var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - - : param var(Variable) - """ - - if exclude_fluid_vars is None: - exclude_fluid_vars = [] - - def is_valid(var): - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW: - return False - # @GRAD are named for gradient variables, checkpoint will not save it. - if "@GRAD" in var.name: - return False - # .trainer_ are named for distribute train variables, checkpoint will not save it. - if ".trainer_" in var.name: - return False - - # .block is named for distribute train variables, checkpoint will not save it. - if ".block" in var.name: - return False - - if "tmp_" in var.name: - return False - - if var.name in exclude_fluid_vars: - return False - - return var.persistable - - return is_valid - - io.load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var(lookup_table_vars), - filename=None) - - def _load_lookup_table_vars(executor, dirname, main_program, - lookup_table_vars): - if not os.path.isdir(dirname): - raise ValueError("There is no directory named '%s'", dirname) - - lookup_table_dirname = os.path.join(dirname, lookup_table_dir) - - emb_var_name = lookup_table_vars[0] - emb_var = main_program.global_block().var(emb_var_name) - - emb_files = [] - for emb_name in os.listdir(lookup_table_dirname): - if emb_var_name in emb_name: - emb_files.append(emb_name) - - convert_program = Program() - global_block = convert_program.global_block() - - emb_var = global_block.create_var( - name=emb_var.name, - shape=emb_var.shape, - dtype=emb_var.dtype, - type=core.VarDesc.VarType.SELECTED_ROWS, - persistable=True) - emb_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) - - sums = [] - - for i, emb_file in enumerate(emb_files): - var_name = "{}_{}".format(emb_var.name, i) - param_var = global_block.create_var( - name=var_name, - shape=emb_var.shape, - dtype=emb_var.dtype, - type=core.VarDesc.VarType.SELECTED_ROWS, - persistable=True) - param_var.desc.set_type(core.VarDesc.VarType.SELECTED_ROWS) - global_block.append_op( - type='load', - inputs={}, - outputs={'Out': [param_var]}, - attrs={ - 'file_path': os.path.join(lookup_table_dirname, var_name) - }) - sums.append(param_var) - global_block.append_op( - type='merge_sparse_lookup_table', - inputs={"X": sums}, - outputs={'Out': emb_var}, - attrs={}) - global_block.append_op( - type='save', - inputs={"X": [emb_var]}, - outputs={}, - attrs={ - 'file_path': os.path.join(lookup_table_dirname, emb_var.name) - }) - global_block.append_op(type='delete_var', inputs={'X': sums}) - executor.run(convert_program) - - if not os.path.isdir(dirname): - raise ValueError("There is no directory named '%s'", dirname) - - if program: - if not isinstance(program, Program): - raise ValueError("program must be an instance of fluid.Program") - else: - local_model = os.path.join(dirname, model_filename) - - with open(local_model, "rb") as f: - program_desc_str = f.read() - - program = Program.parse_from_string(program_desc_str) - - if not core._is_program_version_supported(program._version()): - raise ValueError("Unsupported program version: %d\n" % - program._version()) - - _logger.info("Start Load Sparse Program With " - 
"Distributed Lookup Table Vars from {}, time = {}".format( - dirname, time.ctime())) - - _load_persistable_vars(executor, dirname, program, [lookup_table_var_name]) - _load_lookup_table_vars(executor, dirname, program, [lookup_table_var_name]) - - _logger.info("Finish Load Sparse Program With " - "Distributed Lookup Table Vars from {}, time = {}".format( - dirname, time.ctime())) - - return program - - -def get_inference_model(main_program, feeded_var_names, target_vars): - """ - Prune the given `main_program` to build a new program especially for inference with distributed lookup table , - and then add `feeded_vars` and `target_vars` in this program. - - Args: - main_program(Program|None): The original program, which will be pruned to - build the inference model. If is set None, - the default main program will be used. - Default: None. - feeded_var_names(list[str]): Names of variables that need to be fed data - during inference. - target_vars(list[Variable]): Variables from which we can get inference - results. - Returns: - program(Program) - - Raises: - ValueError: If `feed_var_names` is not a list of basestring. - ValueError: If `target_vars` is not a list of Variable. - - """ - - def prepend_feed_ops(inference_program, - feed_target_names, - feed_holder_name='feed'): - if len(feed_target_names) == 0: - return - - global_block = inference_program.global_block() - - feed_var = global_block.create_var( - name=feed_holder_name, - type=core.VarDesc.VarType.FEED_MINIBATCH, - persistable=True) - - for i, name in enumerate(feed_target_names): - out = global_block.var(name) - global_block._prepend_op( - type='feed', - inputs={'X': [feed_var]}, - outputs={'Out': [out]}, - attrs={'col': i}) - - def append_fetch_ops(inference_program, - fetch_target_names, - fetch_holder_name='fetch'): - global_block = inference_program.global_block() - fetch_var = global_block.create_var( - name=fetch_holder_name, - type=core.VarDesc.VarType.FETCH_LIST, - persistable=True) - - for i, name in enumerate(fetch_target_names): - global_block.append_op( - type='fetch', - inputs={'X': [name]}, - outputs={'Out': [fetch_var]}, - attrs={'col': i}) - - origin_program = main_program.clone() - main_program = main_program.clone() - global_block = main_program.global_block() - - need_to_remove_op_index = [] - for i, op in enumerate(global_block.ops): - op.desc.set_is_target(False) - if op.type == "feed" or op.type == "fetch": - need_to_remove_op_index.append(i) - - for index in need_to_remove_op_index[::-1]: - global_block._remove_op(index) - - main_program.desc.flush() - - main_program = main_program._prune(targets=target_vars) - main_program = main_program._inference_optimize(prune_read_op=True) - - fetch_var_names = [v.name for v in target_vars] - - prepend_feed_ops(main_program, feeded_var_names) - append_fetch_ops(main_program, fetch_var_names) - - return main_program diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index d3dc26c946df4..49bcaf6dd608c 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -270,6 +270,10 @@ def to_list(s): from .core_avx import _load_static_dict from .core_avx import _save_dygraph_dict from .core_avx import _load_dygraph_dict + from .core_avx import _save_lod_tensor + from .core_avx import _load_lod_tensor + from .core_avx import _save_selected_rows + from .core_avx import _load_selected_rows from .core_avx import _create_loaded_parameter from .core_avx import _cuda_synchronize from .core_avx import _promote_types_if_complex_exists @@ -325,6 
+329,10 @@ def to_list(s): from .core_noavx import _load_static_dict from .core_noavx import _save_dygraph_dict from .core_noavx import _load_dygraph_dict + from .core_noavx import _save_lod_tensor + from .core_noavx import _load_lod_tensor + from .core_noavx import _save_selected_rows + from .core_noavx import _load_selected_rows from .core_noavx import _create_loaded_parameter from .core_noavx import _cuda_synchronize from .core_noavx import _promote_types_if_complex_exists diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index e46083295d1ce..bf3d0a81f9948 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -97,10 +97,10 @@ class IterableDataset(Dataset): .. code-block:: python import numpy as np - from paddle.io import Dataset + from paddle.io import IterableDataset # define a random dataset - class RandomDataset(Dataset): + class RandomDataset(IterableDataset): def __init__(self, num_samples): self.num_samples = num_samples diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 7bcd10a726949..7fed27ee45978 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -433,7 +433,10 @@ def _gen_worker_desc(self, trainer_desc): # cfg.program_desc.CopyFrom(program.program._get_desc()) place = pipeline_opt["place"] place_id = pipeline_opt["place_id"] - assert isinstance(place, core.CUDAPlace) + if core.is_compiled_with_cuda(): + assert isinstance(place, core.CUDAPlace) + elif core.is_compiled_with_npu(): + assert isinstance(place, core.NPUPlace) cfg.place = cfg.CUDAPlace cfg.place_id = place_id diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index e80bc1245f9ce..345b71d8999eb 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -69,6 +69,8 @@ def __init__(self, *layers): def __getitem__(self, name): if isinstance(name, slice): return self.__class__(*(list(self._sub_layers.values())[name])) + elif isinstance(name, str): + return self._sub_layers[name] else: if name >= len(self._sub_layers): raise IndexError('index {} is out of range'.format(name)) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index d06a5c890feeb..ca5e5606e432b 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -323,7 +323,7 @@ def scale_loss(loss): @imperative_base.no_grad @framework.dygraph_only -def construct_groups(vars, group_size): +def build_groups(vars, group_size): group_idx = 0 memory_counter = 0 var_groups = OrderedDict() @@ -334,7 +334,7 @@ def construct_groups(vars, group_size): if memory_counter < group_size and dtype == var.dtype: memory_counter += bytes else: - memory_counter = 0 + memory_counter = bytes dtype = var.dtype group_idx += 1 var_groups.setdefault(group_idx, []).append(var) @@ -361,7 +361,7 @@ def sync_params_buffers(model, return # group size is 128M - coalesced_vars = construct_groups(model_vars, 128 * 1024 * 1024) + coalesced_vars = build_groups(model_vars, 128 * 1024 * 1024) for coalesced_var, _, _ in coalesced_vars: paddle.distributed.broadcast( diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index 64209aee875ba..11bc150b281aa 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py 
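Among the hunks above, `Sequential.__getitem__` in `container.py` gains a string branch, so a sublayer registered under a name can now be fetched by that name as well as by integer index or slice; the same block also renames `construct_groups` to `build_groups` and starts a new group's byte counter at the size of its first variable instead of zero. A short usage sketch of the new string indexing (the name-tuple constructor form is the existing `paddle.nn.Sequential` API, not part of this patch):

.. code-block:: python

    import paddle

    model = paddle.nn.Sequential(
        ('fc1', paddle.nn.Linear(10, 20)),
        ('act', paddle.nn.ReLU()),
        ('fc2', paddle.nn.Linear(20, 2)))

    fc1 = model['fc1']     # new: lookup by registered name
    first = model[0]       # existing: integer index, same layer as fc1
    head = model[0:2]      # existing: slice still returns a Sequential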
@@ -21,7 +21,7 @@ from .. import framework from .. import core from .. import unique_name -from ..framework import Variable, Parameter, ParamBase +from ..framework import Variable, Parameter, ParamBase, _getitem_impl_ from .base import switch_to_static_graph from .math_op_patch import monkey_patch_math_varbase from .parallel import scale_loss @@ -437,6 +437,31 @@ def __bool__(self): def __array__(self, dtype=None): return self.numpy().astype(dtype) + def __getitem__(self, item): + def contain_tensor(item): + if not isinstance(item, tuple): + item = [item] + + for slice_item in item: + if isinstance(slice_item, slice): + if isinstance(slice_item.start, Variable) \ + or isinstance(slice_item.stop, Variable) \ + or isinstance(slice_item.step, Variable): + return True + else: + if isinstance(slice_item, Variable): + return True + return False + + if contain_tensor(item): + # 1. Call _getitem_impl_ when item contains tensor. + # Why not call a c++ function ? Because item can't be parsed when it contains tensor. + return _getitem_impl_(self, item) + + else: + # 2. Call c++ func getitem_index_not_tensor to speedup. + return self._getitem_index_not_tensor(item) + for method_name, method in ( ("__bool__", __bool__), ("__nonzero__", __nonzero__), ("_to_static_var", _to_static_var), ("set_value", set_value), @@ -445,7 +470,8 @@ def __array__(self, dtype=None): ("gradient", gradient), ("register_hook", register_hook), ("__str__", __str__), ("__repr__", __str__), ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), - ("__name__", "Tensor"), ("__array__", __array__)): + ("__name__", "Tensor"), ("__array__", __array__), + ("__getitem__", __getitem__)): setattr(core.VarBase, method_name, method) # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class. diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 76bc68f24d2fe..62a9c42ee0a61 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1451,8 +1451,12 @@ def _run_from_dataset(self, for var in program.global_block().vars.values(): if var.is_data: data_vars.append(var) - dataset = paddle.fluid.DatasetFactory().create_dataset( - 'FileInstantDataset') + if core.is_compiled_with_npu(): + dataset = paddle.fluid.DatasetFactory().create_dataset( + 'InMemoryDataset') + else: + dataset = paddle.fluid.DatasetFactory().create_dataset( + 'FileInstantDataset') dataset.set_batch_size(1) dataset.set_thread(1) dataset.set_filelist(['None']) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 5845a2c78ecf9..ccfec944a7940 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2254,7 +2254,8 @@ class Operator(object): 'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id', 'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv', - 'c_wait_comm', 'c_wait_compute' + 'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl', + 'copy_cross_scope' } def __init__(self, @@ -6073,7 +6074,8 @@ def device_guard(device=None): A context manager that specifies the device on which the OP will be placed. Args: - device(str|None): Specify the device to use in the context. It should be 'cpu' or 'gpu', + device(str|None): Specify the device to use in the context. It should be ``cpu``, + ``gpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. When it is set to 'cpu' or 'gpu', all OPs created in the context will be placed on CPUPlace or CUDAPlace. 
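The `varbase_patch_methods.py` hunk above gives dygraph tensors a Python `__getitem__`: plain indices keep going through the fast C++ `_getitem_index_not_tensor` path, while any item whose slice bounds are themselves tensors is routed to the Python `_getitem_impl_` helper, since such items cannot be parsed on the C++ side. A rough sketch of the intended behaviour after this change (not taken from the patch's tests):

.. code-block:: python

    import paddle

    x = paddle.arange(12, dtype='float32').reshape([3, 4])

    # Plain Python indices: dispatched to the C++ fast path.
    print(x[1, 2:])

    # Tensor-valued slice bounds: dispatched to _getitem_impl_.
    start = paddle.to_tensor(1)
    end = paddle.to_tensor(3)
    print(x[start:end])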
When 'gpu' is set and the program runs on single-card, the device index will be the same as the device on which the diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py deleted file mode 100644 index 8d31a68e8083d..0000000000000 --- a/python/paddle/fluid/incubate/data_generator/__init__.py +++ /dev/null @@ -1,375 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys - -__all__ = ['MultiSlotDataGenerator', 'MultiSlotStringDataGenerator'] - - -class DataGenerator(object): - """ - DataGenerator is a general Base class for user to inherit - A user who wants to define his/her own python processing logic - with paddle.fluid.dataset should inherit this class. - """ - - def __init__(self): - self._proto_info = None - self.batch_size_ = 32 - - def _set_line_limit(self, line_limit): - if not isinstance(line_limit, int): - raise ValueError("line_limit%s must be in int type" % - type(line_limit)) - if line_limit < 1: - raise ValueError("line_limit can not less than 1") - self._line_limit = line_limit - - def set_batch(self, batch_size): - ''' - Set batch size of current DataGenerator - This is necessary only if a user wants to define generator_batch - - Example: - - .. code-block:: python - import paddle.fluid.incubate.data_generator as dg - class MyData(dg.DataGenerator): - - def generate_sample(self, line): - def local_iter(): - int_words = [int(x) for x in line.split()] - yield ("words", int_words) - return local_iter - - def generate_batch(self, samples): - def local_iter(): - for s in samples: - yield ("words", s[1].extend([s[1][0]])) - mydata = MyData() - mydata.set_batch(128) - - ''' - self.batch_size_ = batch_size - - def run_from_memory(self): - ''' - This function generator data from memory, it is usually used for - debug and benchmarking - - Example: - .. code-block:: python - import paddle.fluid.incubate.data_generator as dg - class MyData(dg.DataGenerator): - - def generate_sample(self, line): - def local_iter(): - yield ("words", [1, 2, 3, 4]) - return local_iter - - mydata = MyData() - mydata.run_from_memory() - ''' - batch_samples = [] - line_iter = self.generate_sample(None) - for user_parsed_line in line_iter(): - if user_parsed_line == None: - continue - batch_samples.append(user_parsed_line) - if len(batch_samples) == self.batch_size_: - batch_iter = self.generate_batch(batch_samples) - for sample in batch_iter(): - sys.stdout.write(self._gen_str(sample)) - batch_samples = [] - if len(batch_samples) > 0: - batch_iter = self.generate_batch(batch_samples) - for sample in batch_iter(): - sys.stdout.write(self._gen_str(sample)) - - def run_from_stdin(self): - ''' - This function reads the data row from stdin, parses it with the - process function, and further parses the return value of the - process function with the _gen_str function. 
The parsed data will - be wrote to stdout and the corresponding protofile will be - generated. - - Example: - - .. code-block:: python - import paddle.fluid.incubate.data_generator as dg - class MyData(dg.DataGenerator): - - def generate_sample(self, line): - def local_iter(): - int_words = [int(x) for x in line.split()] - yield ("words", [int_words]) - return local_iter - - mydata = MyData() - mydata.run_from_stdin() - - ''' - batch_samples = [] - for line in sys.stdin: - line_iter = self.generate_sample(line) - for user_parsed_line in line_iter(): - if user_parsed_line == None: - continue - batch_samples.append(user_parsed_line) - if len(batch_samples) == self.batch_size_: - batch_iter = self.generate_batch(batch_samples) - for sample in batch_iter(): - sys.stdout.write(self._gen_str(sample)) - batch_samples = [] - if len(batch_samples) > 0: - batch_iter = self.generate_batch(batch_samples) - for sample in batch_iter(): - sys.stdout.write(self._gen_str(sample)) - - def _gen_str(self, line): - ''' - Further processing the output of the process() function rewritten by - user, outputting data that can be directly read by the datafeed,and - updating proto_info information. - - Args: - line(str): the output of the process() function rewritten by user. - - Returns: - Return a string data that can be read directly by the datafeed. - ''' - raise NotImplementedError( - "pls use MultiSlotDataGenerator or PairWiseDataGenerator") - - def generate_sample(self, line): - ''' - This function needs to be overridden by the user to process the - original data row into a list or tuple. - - Args: - line(str): the original data row - - Returns: - Returns the data processed by the user. - The data format is list or tuple: - [(name, [feasign, ...]), ...] - or ((name, [feasign, ...]), ...) - - For example: - [("words", [1926, 08, 17]), ("label", [1])] - or (("words", [1926, 08, 17]), ("label", [1])) - - Note: - The type of feasigns must be in int or float. Once the float - element appears in the feasign, the type of that slot will be - processed into a float. - - Example: - - .. code-block:: python - import paddle.fluid.incubate.data_generator as dg - class MyData(dg.DataGenerator): - - def generate_sample(self, line): - def local_iter(): - int_words = [int(x) for x in line.split()] - yield ("words", [int_words]) - return local_iter - - ''' - raise NotImplementedError( - "Please rewrite this function to return a list or tuple: " + - "[(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...)") - - def generate_batch(self, samples): - ''' - This function needs to be overridden by the user to process the - generated samples from generate_sample(self, str) function - It is usually used as batch processing when a user wants to - do preprocessing on a batch of samples, e.g. padding according to - the max length of a sample in the batch - - Args: - samples(list tuple): generated sample from generate_sample - - Returns: - a python generator, the same format as return value of generate_sample - - Example: - - .. 
code-block:: python - import paddle.fluid.incubate.data_generator as dg - class MyData(dg.DataGenerator): - - def generate_sample(self, line): - def local_iter(): - int_words = [int(x) for x in line.split()] - yield ("words", int_words) - return local_iter - - def generate_batch(self, samples): - def local_iter(): - for s in samples: - yield ("words", s[1].extend([s[1][0]])) - mydata = MyData() - mydata.set_batch(128) - ''' - - def local_iter(): - for sample in samples: - yield sample - - return local_iter - - -# TODO: guru4elephant -# add more generalized DataGenerator that can adapt user-defined slot -# for example, [(name, float_list), (name, str_list), (name, int_list)] -class MultiSlotStringDataGenerator(DataGenerator): - def _gen_str(self, line): - ''' - Further processing the output of the process() function rewritten by - user, outputting data that can be directly read by the MultiSlotDataFeed, - and updating proto_info information. - - The input line will be in this format: - >>> [(name, [str(feasign), ...]), ...] - >>> or ((name, [str(feasign), ...]), ...) - The output will be in this format: - >>> [ids_num id1 id2 ...] ... - - For example, if the input is like this: - >>> [("words", ["1926", "08", "17"]), ("label", ["1"])] - >>> or (("words", ["1926", "08", "17"]), ("label", ["1"])) - the output will be: - >>> 3 1234 2345 3456 1 1 - - Args: - line(str): the output of the process() function rewritten by user. - - Returns: - Return a string data that can be read directly by the MultiSlotDataFeed. - ''' - if not isinstance(line, list) and not isinstance(line, tuple): - raise ValueError( - "the output of process() must be in list or tuple type" - "Examples: [('words', ['1926', '08', '17']), ('label', ['1'])]") - output = "" - for index, item in enumerate(line): - name, elements = item - if output: - output += " " - out_str = [] - out_str.append(str(len(elements))) - out_str.extend(elements) - output += " ".join(out_str) - return output + "\n" - - -class MultiSlotDataGenerator(DataGenerator): - def _gen_str(self, line): - ''' - Further processing the output of the process() function rewritten by - user, outputting data that can be directly read by the MultiSlotDataFeed, - and updating proto_info information. - - The input line will be in this format: - >>> [(name, [feasign, ...]), ...] - >>> or ((name, [feasign, ...]), ...) - The output will be in this format: - >>> [ids_num id1 id2 ...] ... - The proto_info will be in this format: - >>> [(name, type), ...] - - For example, if the input is like this: - >>> [("words", [1926, 08, 17]), ("label", [1])] - >>> or (("words", [1926, 08, 17]), ("label", [1])) - the output will be: - >>> 3 1234 2345 3456 1 1 - the proto_info will be: - >>> [("words", "uint64"), ("label", "uint64")] - - Args: - line(str): the output of the process() function rewritten by user. - - Returns: - Return a string data that can be read directly by the MultiSlotDataFeed. 
- ''' - if not isinstance(line, list) and not isinstance(line, tuple): - raise ValueError( - "the output of process() must be in list or tuple type" - "Example: [('words', [1926, 08, 17]), ('label', [1])]") - output = "" - - if self._proto_info is None: - self._proto_info = [] - for item in line: - name, elements = item - if not isinstance(name, str): - raise ValueError("name%s must be in str type" % type(name)) - if not isinstance(elements, list): - raise ValueError("elements%s must be in list type" % - type(elements)) - if not elements: - raise ValueError( - "the elements of each field can not be empty, you need padding it in process()." - ) - self._proto_info.append((name, "uint64")) - if output: - output += " " - output += str(len(elements)) - for elem in elements: - if isinstance(elem, float): - self._proto_info[-1] = (name, "float") - elif not isinstance(elem, int) and not isinstance(elem, - long): - raise ValueError( - "the type of element%s must be in int or float" % - type(elem)) - output += " " + str(elem) - else: - if len(line) != len(self._proto_info): - raise ValueError( - "the complete field set of two given line are inconsistent.") - for index, item in enumerate(line): - name, elements = item - if not isinstance(name, str): - raise ValueError("name%s must be in str type" % type(name)) - if not isinstance(elements, list): - raise ValueError("elements%s must be in list type" % - type(elements)) - if not elements: - raise ValueError( - "the elements of each field can not be empty, you need padding it in process()." - ) - if name != self._proto_info[index][0]: - raise ValueError( - "the field name of two given line are not match: require<%s>, get<%s>." - % (self._proto_info[index][0], name)) - if output: - output += " " - output += str(len(elements)) - for elem in elements: - if self._proto_info[index][1] != "float": - if isinstance(elem, float): - self._proto_info[index] = (name, "float") - elif not isinstance(elem, int) and not isinstance(elem, - long): - raise ValueError( - "the type of element%s must be in int or float" - % type(elem)) - output += " " + str(elem) - return output + "\n" diff --git a/python/paddle/fluid/incubate/data_generator/test_data_generator.py b/python/paddle/fluid/incubate/data_generator/test_data_generator.py deleted file mode 100644 index dcacd67e92a88..0000000000000 --- a/python/paddle/fluid/incubate/data_generator/test_data_generator.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -from __init__ import * - - -class SyntheticData(MultiSlotDataGenerator): - def generate_sample(self, line): - def data_iter(): - for i in range(10000): - yield ("words", [1, 2, 3, 4]), ("label", [0]) - - return data_iter - - -class SyntheticStringData(MultiSlotStringDataGenerator): - def generate_sample(self, line): - def data_iter(): - for i in range(10000): - yield ("words", ["1", "2", "3", "4"], ("label", ["0"])) - - -sd = SyntheticData() -sd.run_from_memory() - -sd2 = SyntheticStringData() -sd.run_from_memory() diff --git a/python/paddle/fluid/incubate/fleet/utils/fleet_barrier_util.py b/python/paddle/fluid/incubate/fleet/utils/fleet_barrier_util.py deleted file mode 100644 index a9fd8ac74f428..0000000000000 --- a/python/paddle/fluid/incubate/fleet/utils/fleet_barrier_util.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet -from paddle.fluid.contrib.utils import HDFSClient -import os -import time - - -def check_all_trainers_ready(ready_path, epoch): - trainer_num = fleet.worker_num() - trainer_id = fleet.worker_index() - - hadoop_home = os.getenv("HADOOP_HOME") - configs = { - "fs.default.name": os.getenv("FS_NAME"), - "hadoop.job.ugi": os.getenv("FS_UGI") - } - - node_ready = "ready.{}.{}.done".format(epoch, trainer_id) - - with open(node_ready, "w") as node: - node.write("") - - client = HDFSClient(hadoop_home, configs) - if not client.is_dir(ready_path): - client.makedirs(ready_path) - client.upload( - hdfs_path=ready_path, - local_path=node_ready, - overwrite=True, - retry_times=0) - - print("PUT {} ON HDFS {} OK".format(node_ready, ready_path)) - - while True: - ready_num = len(client.ls(ready_path)) - print("have {} trainers need to be ready".format(trainer_num - ready_num - % trainer_num)) - if ready_num % trainer_num == 0: - break - time.sleep(10) - ready_num = len(client.ls(ready_path)) - - print("All trainers are ready, continue training") diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index c5345c7fed235..dc153614fcd26 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -152,8 +152,7 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - # Initialization Ops should be prepended and not appended - op = block._prepend_op( + op = block.append_op( type="fill_constant", outputs={"Out": out_var}, attrs={ @@ -242,7 +241,6 @@ def __call__(self, var, block=None): ["uint16", "float16", "float32", "float64"], "uniform_random") - # Initialization Ops should be prepended and not appended if self._seed == 0: self._seed = block.program.random_seed @@ -260,7 +258,7 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - op = block._prepend_op( + op = block.append_op( type="uniform_random", inputs={}, outputs={"Out": 
out_var}, @@ -334,7 +332,7 @@ def __call__(self, var, block=None): check_variable_and_dtype(var, "Out", ["uint16", "float16", "float32", "float64"], "guassian_random") - # Initialization Ops should be prepended and not appended + if self._seed == 0: self._seed = block.program.random_seed @@ -352,7 +350,7 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - op = block._prepend_op( + op = block.append_op( type="gaussian_random", outputs={"Out": out_var}, attrs={ @@ -418,7 +416,7 @@ def __call__(self, var, block=None): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) - # Initialization Ops should be prepended and not appended + if self._seed == 0: self._seed = block.program.random_seed @@ -436,7 +434,7 @@ def __call__(self, var, block=None): out_dtype = var.dtype out_var = var - op = block._prepend_op( + op = block.append_op( type="truncated_gaussian_random", outputs={"Out": out_var}, attrs={ @@ -557,7 +555,7 @@ def __call__(self, var, block=None): if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) - op = block._prepend_op( + op = block.append_op( type="uniform_random", inputs={}, outputs={"Out": out_var}, @@ -572,7 +570,7 @@ def __call__(self, var, block=None): else: std = np.sqrt(2.0 / float(fan_in + fan_out)) - op = block._prepend_op( + op = block.append_op( type="gaussian_random", outputs={"Out": out_var}, attrs={ @@ -688,7 +686,7 @@ def __call__(self, var, block=None): if self._uniform: limit = np.sqrt(6.0 / float(fan_in)) - op = block._prepend_op( + op = block.append_op( type="uniform_random", inputs={}, outputs={"Out": out_var}, @@ -703,7 +701,7 @@ def __call__(self, var, block=None): else: std = np.sqrt(2.0 / float(fan_in)) - op = block._prepend_op( + op = block.append_op( type="gaussian_random", outputs={"Out": out_var}, attrs={ @@ -920,7 +918,6 @@ def __call__(self, var, block=None): out_dtype = var.dtype np_value = self._value - # Initialization Ops should be prepended and not appended if out_dtype == VarDesc.VarType.FP32: value_name = "fp32_values" values = [float(v) for v in np_value.flat] @@ -932,7 +929,7 @@ def __call__(self, var, block=None): if self._value.size > 1024 * 1024 * 1024: raise ValueError("The size of input is too big. Please consider " "saving it to file and 'load_op' to load it") - op = block._prepend_op( + op = block.append_op( type='assign_value', outputs={'Out': out_var}, attrs={ diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 858078615a752..e9738b6660eea 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -381,7 +381,10 @@ def create_parameter(self, return self.main_program.global_block().create_parameter( dtype=dtype, shape=shape, type=type, **attr._to_kwargs()) - def create_variable_for_type_inference(self, dtype, stop_gradient=False): + def create_variable_for_type_inference(self, + dtype, + stop_gradient=False, + shape=None): """Create a temporary variable that should be type inferred layer. 
Note: @@ -397,6 +400,7 @@ def create_variable_for_type_inference(self, dtype, stop_gradient=False): name=unique_name.generate_with_ignorable_key(".".join( [self.name, 'tmp'])), dtype=dtype, + shape=shape, type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, stop_gradient=stop_gradient) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 841daf7a41d1f..67cdc6dce5a82 100755 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -27,6 +27,7 @@ __activations_noattr__ = [ 'sigmoid', + 'silu', 'logsigmoid', 'tanh_shrink', 'softplus', @@ -100,6 +101,20 @@ """) +add_sample_code(globals()["silu"], r""" +Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.silu(x) + print(out) + # [ 0.7310586 1.7615942 2.8577224, 3.9280552 ] + +""") + add_sample_code(globals()["logsigmoid"], r""" Examples: .. code-block:: python diff --git a/python/paddle/fluid/layers/sequence_lod.py b/python/paddle/fluid/layers/sequence_lod.py index df1113660f7d8..a42ec2c92a3aa 100644 --- a/python/paddle/fluid/layers/sequence_lod.py +++ b/python/paddle/fluid/layers/sequence_lod.py @@ -139,10 +139,11 @@ def sequence_conv(input, .. code-block:: python - import paddle.fluid as fluid + import paddle + paddle.enable_static() - x = fluid.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1) - x_conved = fluid.layers.sequence_conv(input=x, num_filters=2, filter_size=3, padding_start=-1) + x = paddle.static.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1) + x_conved = paddle.static.nn.sequence_conv(input=x, num_filters=2, filter_size=3, padding_start=-1) """ assert not in_dygraph_mode(), ( @@ -233,15 +234,17 @@ def sequence_softmax(input, use_cudnn=False, name=None): Examples: .. code-block:: python - - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[7, 1], + + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[7, 1], dtype='float32', lod_level=1) - x_sequence_softmax_1 = fluid.layers.sequence_softmax(input=x) + x_sequence_softmax_1 = paddle.static.nn.sequence_softmax(input=x) - y = fluid.data(name='y', shape=[7], + y = paddle.static.data(name='y', shape=[7], dtype='float32', lod_level=1) - x_sequence_softmax_2 = fluid.layers.sequence_softmax(input=y) + x_sequence_softmax_2 = paddle.static.nn.sequence_softmax(input=y) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -334,15 +337,16 @@ def sequence_pool(input, pool_type, is_test=False, pad_value=0.0): .. 
code-block:: python - import paddle.fluid as fluid + import paddle + paddle.enable_static() - x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) - avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') - sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum') - sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt') - max_x = fluid.layers.sequence_pool(input=x, pool_type='max') - last_x = fluid.layers.sequence_pool(input=x, pool_type='last') - first_x = fluid.layers.sequence_pool(input=x, pool_type='first') + x = paddle.static.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) + avg_x = paddle.static.nn.sequence_pool(input=x, pool_type='average') + sum_x = paddle.static.nn.sequence_pool(input=x, pool_type='sum') + sqrt_x = paddle.static.nn.sequence_pool(input=x, pool_type='sqrt') + max_x = paddle.static.nn.sequence_pool(input=x, pool_type='max') + last_x = paddle.static.nn.sequence_pool(input=x, pool_type='last') + first_x = paddle.static.nn.sequence_pool(input=x, pool_type='first') """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -413,10 +417,12 @@ def sequence_concat(input, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1) - y = fluid.data(name='y', shape=[-1, 10], dtype='float32', lod_level=1) - out = fluid.layers.sequence_concat(input=[x, y]) + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[-1, 10], dtype='float32', lod_level=1) + y = paddle.static.data(name='y', shape=[-1, 10], dtype='float32', lod_level=1) + out = paddle.static.nn.sequence_concat(input=[x, y]) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -481,9 +487,11 @@ def sequence_first_step(input): .. code-block:: python - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) - x_first_step = fluid.layers.sequence_first_step(input=x) + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) + x_first_step = paddle.static.nn.sequence_first_step(input=x) """ check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'sequence_first_step') @@ -538,9 +546,11 @@ def sequence_last_step(input): .. code-block:: python - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) - x_last_step = fluid.layers.sequence_last_step(input=x) + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) + x_last_step = paddle.static.nn.sequence_last_step(input=x) """ check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'sequence_last_step') @@ -598,13 +608,15 @@ def sequence_slice(input, offset, length, name=None): .. 
code-block:: python - import paddle.fluid as fluid + import paddle + paddle.enable_static() + import numpy as np - seqs = fluid.data(name='x', shape=[10, 5], + seqs = paddle.static.data(name='x', shape=[10, 5], dtype='float32', lod_level=1) - offset = fluid.layers.assign(input=np.array([[0, 1]]).astype("int32")) - length = fluid.layers.assign(input=np.array([[2, 1]]).astype("int32")) - subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset, + offset = paddle.assign(np.array([[0, 1]]).astype("int32")) + length = paddle.assign(np.array([[2, 1]]).astype("int32")) + subseqs = paddle.static.nn.sequence_slice(input=seqs, offset=offset, length=length) """ assert not in_dygraph_mode(), ( @@ -715,17 +727,18 @@ def sequence_expand(x, y, ref_level=-1, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers + import paddle + from paddle import fluid + paddle.enable_static() import numpy as np - x = fluid.data(name='x', shape=[4, 1], dtype='float32') - y = fluid.data(name='y', shape=[8, 1], + x = paddle.static.data(name='x', shape=[4, 1], dtype='float32') + y = paddle.static.data(name='y', shape=[8, 1], dtype='float32', lod_level=1) - out = layers.sequence_expand(x=x, y=y, ref_level=0) + out = paddle.static.nn.sequence_expand(x=x, y=y, ref_level=0) - exe = fluid.Executor(fluid.CPUPlace()) - place = fluid.CPUPlace() + exe = paddle.static.Executor(fluid.CPUPlace()) + place = paddle.CPUPlace() np_data = np.array([[1], [2], [3], [4]]).astype('float32') x_lod_tensor = fluid.create_lod_tensor(np_data, [[2, 2]], place) @@ -836,13 +849,14 @@ def sequence_expand_as(x, y, name=None): Examples: .. code-block:: python + import paddle import paddle.fluid as fluid - import paddle.fluid.layers as layers + paddle.enable_static() import numpy as np - x = fluid.data(name='x', shape=[4, 1], dtype='float32') - y = fluid.data(name='y', shape=[8, 1], dtype='float32', lod_level=1) - out = layers.sequence_expand_as(x=x, y=y) + x = paddle.static.data(name='x', shape=[4, 1], dtype='float32') + y = paddle.static.data(name='y', shape=[8, 1], dtype='float32', lod_level=1) + out = paddle.static.nn.sequence_expand_as(x=x, y=y) exe = fluid.Executor(fluid.CPUPlace()) place = fluid.CPUPlace() @@ -969,13 +983,15 @@ def sequence_pad(x, pad_value, maxlen=None, name=None): Examples: .. code-block:: python + import paddle + paddle.enable_static() import paddle.fluid as fluid import numpy - x = fluid.data(name='x', shape=[10, 5], dtype='float32', lod_level=1) - pad_value = fluid.layers.assign( - input=numpy.array([0.0], dtype=numpy.float32)) - out = fluid.layers.sequence_pad(x=x, pad_value=pad_value) + x = paddle.static.data(name='x', shape=[10, 5], dtype='float32', lod_level=1) + pad_value = paddle.assign( + numpy.array([0.0], dtype=numpy.float32)) + out = paddle.static.nn.sequence_pad(x=x, pad_value=pad_value) """ assert not in_dygraph_mode(), ( @@ -1048,16 +1064,18 @@ def sequence_unpad(x, length, name=None): Examples: .. 
code-block:: python + import paddle + paddle.enable_static() import paddle.fluid as fluid import numpy # pad data - x = fluid.data(name='x', shape=[10, 5], dtype='float32', lod_level=1) - pad_value = fluid.layers.assign(input=numpy.array([0.0], dtype=numpy.float32)) - pad_data, len = fluid.layers.sequence_pad(x=x, pad_value=pad_value) + x = paddle.static.data(name='x', shape=[10, 5], dtype='float32', lod_level=1) + pad_value = paddle.assign(numpy.array([0.0], dtype=numpy.float32)) + pad_data, len = paddle.static.nn.sequence_pad(x=x, pad_value=pad_value) # unpad data - unpad_data = fluid.layers.sequence_unpad(x=pad_data, length=len) + unpad_data = paddle.static.nn.sequence_unpad(x=pad_data, length=len) """ assert not in_dygraph_mode(), ( @@ -1123,9 +1141,11 @@ def sequence_reshape(input, new_dim): Examples: .. code-block:: python - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[None, 16], dtype='float32', lod_level=1) - x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=4) + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[None, 16], dtype='float32', lod_level=1) + x_reshaped = paddle.static.nn.sequence_reshape(input=x, new_dim=4) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -1200,12 +1220,13 @@ def sequence_scatter(input, index, updates, name=None): .. code-block:: python - import paddle.fluid as fluid + import paddle + paddle.enable_static() - input = fluid.data( name="x", shape=[None, 3, 6], dtype='float32' ) - index = fluid.data( name='index', shape=[12, 1], dtype='int64', lod_level=1) - updates = fluid.data( name='updates', shape=[12, 1], dtype='float32', lod_level=1) - output = fluid.layers.sequence_scatter(input, index, updates) + input = paddle.static.data(name="x", shape=[None, 3, 6], dtype='float32' ) + index = paddle.static.data(name='index', shape=[12, 1], dtype='int64', lod_level=1) + updates = paddle.static.data(name='updates', shape=[12, 1], dtype='float32', lod_level=1) + output = paddle.static.nn.sequence_scatter(input, index, updates) """ assert not in_dygraph_mode(), ( @@ -1279,10 +1300,11 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - - x = fluid.data(name='x', shape=[-1, 1], dtype='int32', lod_level=1) - out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0) + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[-1, 1], dtype='int32', lod_level=1) + out = paddle.static.nn.sequence_enumerate(input=x, win_size=3, pad_value=0) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") @@ -1333,26 +1355,30 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): Tensor or LodTensor with shape [d_1, d_2, ..., d_n]. maxlen (int, optional): Maximum length of the sequence. If :code:`maxlen` \ is None, it would be replace with :math:`max(x)`. - dtype (np.dtype|core.VarDesc.VarType|str, optional): Data type of the output, \ + dtype (np.dtype|paddle.dtype|str, optional): Data type of the output, \ ``int64`` by default. name(str, optional): For detailed information, please refer \ to :ref:`api_guide_Name`. Usually name is no need to set and \ None by default. - Returns: The output sequence mask. Tensor or LodTensor with shape [d_1, d_2, ..., d_n, maxlen] \ - and data type of :code:`dtype`. The data type should be float32, float64, int8, \ + Returns: The output sequence mask. 
Tensor with shape [d_1, d_2, ..., d_n, maxlen] \ + and data type of :code:`dtype`. The data type should be bool, float32, float64, int8, \ int32 or int64. - Return Type: Variable + Return Type: Tensor Examples: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers + import paddle + + lengths = paddle.to_tensor([10, 9, 8]) + mask = paddle.nn.functional.sequence_mask(lengths) - x = fluid.data(name='x', shape=[10], dtype='float32', lod_level=1) - mask = layers.sequence_mask(x=x) + print(mask.numpy()) + # [[1 1 1 1 1 1 1 1 1 1] + # [1 1 1 1 1 1 1 1 1 0] + # [1 1 1 1 1 1 1 1 0 0]] """ helper = LayerHelper('sequence_mask', **locals()) @@ -1414,9 +1440,11 @@ def sequence_reverse(x, name=None): Examples: .. code-block:: python - import paddle.fluid as fluid - x = fluid.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) - x_reversed = fluid.layers.sequence_reverse(x) + import paddle + paddle.enable_static() + + x = paddle.static.data(name='x', shape=[None, 10], dtype='float32', lod_level=1) + x_reversed = paddle.static.nn.sequence_reverse(x) """ assert not in_dygraph_mode(), ( "sequence layer is not supported in dygraph mode yet.") diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 7458466b02fd4..3e2c06f69cfbd 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -14,6 +14,7 @@ from __future__ import print_function +import math import numpy import six import warnings @@ -546,8 +547,10 @@ def assign(input, output=None): The OP copies the :attr:`input` to the :attr:`output`. Parameters: - input (Tensor|numpy.ndarray): A tensor or numpy ndarray, its data type supports - float16, float32, float64, int32 and int64. + input (Tensor|numpy.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple/list of scalar, + or scalar. Its data type supports float16, float32, float64, int32, int64, and bool. + Note: the float64 data will be converted to float32 because of current platform protobuf + data limitation. output (Tensor, optional): A tensor. If :attr:`output` is None, a new tensor will be created as :attr:`output`. Default: None. 
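A minimal usage sketch of the widened assign() input handling documented above (scalar, list/tuple and numpy inputs, with float64 numpy data downcast to float32 plus a warning); it assumes these paths are reachable through the public paddle.assign used in the docstring examples:

    import numpy as np
    import paddle

    a = paddle.assign(3)                                   # scalar is wrapped as numpy.array([3])
    b = paddle.assign([0.5, 1.5, 2.5])                     # list goes through numpy.array(...), stored as float32
    c = paddle.assign(np.zeros((2, 2), dtype='float64'))   # emits the float64 warning, result is float32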
@@ -569,9 +572,15 @@ def assign(input, output=None): result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] """ helper = LayerHelper('assign', **locals()) - check_type(input, 'input', (Variable, numpy.ndarray), 'assign') + check_type(input, 'input', (Variable, numpy.ndarray, list, tuple, float, + int, bool), 'assign') is_inplace = True if output is not None else False + if numpy.isscalar(input) and not isinstance(input, str): + input = numpy.array([input]) + elif isinstance(input, (list, tuple)): + input = numpy.array(input) + if isinstance(input, Variable): check_dtype( input.dtype, 'input', @@ -584,6 +593,14 @@ def assign(input, output=None): type='assign', inputs={'X': [input]}, outputs={'Out': [output]}) elif isinstance(input, numpy.ndarray): dtype = convert_np_dtype_to_dtype_(input.dtype) + if dtype == VarDesc.VarType.FP64: + # Setting FP64 numpy data is not supported in Paddle, so we + # use FP32 here + warnings.warn( + "paddle.assign doesn't support float64 input now due " + "to current platform protobuf data limitation, we convert " + "it to float32") + dtype = VarDesc.VarType.FP32 if dtype == VarDesc.VarType.BOOL: value_name = "bool_values" values = [bool(v) for v in input.flat] @@ -1373,6 +1390,11 @@ def range(start, end, step, dtype, name=None): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) + out_shape = None + if not isinstance(start, Variable) and not isinstance( + end, Variable) and not isinstance(step, Variable): + out_shape = [int(math.ceil((end - start) / step))] + if not isinstance(start, Variable): with device_guard("cpu"): start = fill_constant([1], dtype, start, force_cpu=True) @@ -1397,7 +1419,7 @@ def range(start, end, step, dtype, name=None): check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'range/arange') helper = LayerHelper('range', **locals()) - out = helper.create_variable_for_type_inference(dtype) + out = helper.create_variable_for_type_inference(dtype, shape=out_shape) helper.append_op( type='range', inputs={'Start': start, diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index c47cce76f8984..e8f8bdd3f9add 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -16,6 +16,7 @@ import six from . 
import layers from .data_feeder import check_variable_and_dtype, convert_dtype +from ..utils import deprecated __all__ = [ "simple_img_conv_pool", @@ -332,6 +333,7 @@ def sequence_conv_pool(input, return pool_out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.glu") def glu(input, dim=-1): r""" :api_attr: Static Graph diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index e112247198231..21b4c429a66e9 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4818,7 +4818,10 @@ def device_cmp(device1, device2): place_list = [] for dev in device_list: dev_index = int(dev.split(":")[1]) - place_list.append(core.CUDAPlace(0)) + if core.is_compiled_with_cuda(): + place_list.append(core.CUDAPlace(dev_index % 1)) + elif core.is_compiled_with_npu(): + place_list.append(core.NPUPlace(dev_index % 1)) # Step6: Split startup program new_startup_program = self._split_startup_program(startup_program, @@ -4837,7 +4840,10 @@ def device_cmp(device1, device2): self._accumulate_gradients(real_block) real_block._sync_with_cpp() - place_id = int(os.getenv("FLAGS_selected_gpus", "0")) + if core.is_compiled_with_cuda(): + place_id = int(os.getenv("FLAGS_selected_gpus", "0")) + elif core.is_compiled_with_npu(): + place_id = int(os.getenv("FLAGS_selected_npus", "0")) main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 5e0e5f724a889..db08955c455fb 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -28,10 +28,12 @@ def _create_regularization_of_grad(param, grad, regularization=None): Function helper of append_regularization_ops. """ # If no gradient or no regularization is specified, then we don't need to do anything - if grad is None or (param.regularizer is None and regularization is None): + if grad is None or ((not hasattr(param, 'regularizer') or ( + hasattr(param, 'regularizer') and param.regularizer is None)) and + regularization is None): return grad regularization_term = None - if param.regularizer is not None: + if hasattr(param, 'regularizer') and param.regularizer is not None: # Add variable for regularization term in grad block regularization_term = param.regularizer(param, grad, grad.block) elif regularization is not None: @@ -213,7 +215,7 @@ def __call__(self, param, grad, block): Returns: new variable for weight decay """ - assert isinstance(param, framework.Parameter) + assert isinstance(param, framework.Variable) assert isinstance(block, framework.Block) inputs = {"X": [param]} @@ -320,7 +322,7 @@ def __call__(self, param, grad, block): Returns: new variable for weight decay """ - assert isinstance(param, framework.Parameter) + assert isinstance(param, framework.Variable) assert isinstance(block, framework.Block) if framework.in_dygraph_mode(): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py index db97e86385ae4..dddb14eb78e8a 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py @@ -247,7 +247,8 @@ def train_model(self, device, use_custom_op=False, use_pe=False): paddle.set_device(device) with paddle.static.scope_guard(paddle.static.Scope()): - with paddle.static.program_guard(paddle.static.Program()): + with paddle.static.program_guard(paddle.static.Program(), + 
paddle.static.Program()): x = paddle.static.data( shape=[None, self.in_dim], name='x', dtype='float32') y = paddle.static.data( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index b267617451772..642e93ebcb85e 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -142,7 +142,8 @@ def setUp(self): cmd = 'cd /d {} && python custom_relu_setup.py install'.format( cur_dir) else: - cmd = 'cd {} && python custom_relu_setup.py install'.format(cur_dir) + cmd = 'cd {} && {} custom_relu_setup.py install'.format( + cur_dir, sys.executable) run_cmd(cmd) # NOTE(Aurelius84): Normally, it's no need to add following codes for users. @@ -254,6 +255,35 @@ def test_static_save_and_load_inference_model(self): format(predict, predict_infer)) paddle.disable_static() + def test_static_save_and_run_inference_predictor(self): + paddle.enable_static() + np_data = np.random.random((1, 1, 28, 28)).astype("float32") + np_label = np.random.random((1, 1)).astype("int64") + path_prefix = "custom_op_inference/custom_relu" + from paddle.inference import Config + from paddle.inference import create_predictor + for device in self.devices: + predict = custom_relu_static_inference( + self.custom_ops[0], device, np_data, np_label, path_prefix) + # load inference model + config = Config(path_prefix + ".pdmodel", + path_prefix + ".pdiparams") + predictor = create_predictor(config) + input_tensor = predictor.get_input_handle(predictor.get_input_names( + )[0]) + input_tensor.reshape(np_data.shape) + input_tensor.copy_from_cpu(np_data.copy()) + predictor.run() + output_tensor = predictor.get_output_handle( + predictor.get_output_names()[0]) + predict_infer = output_tensor.copy_to_cpu() + self.assertTrue( + np.isclose( + predict, predict_infer, rtol=5e-5).any(), + "custom op predict: {},\n custom op infer predict: {}".format( + predict, predict_infer)) + paddle.disable_static() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d0906052c999f..3bf96944edbf7 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -11,6 +11,7 @@ endif() string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist) list(APPEND DIST_TEST_OPS test_pipeline) +list(APPEND DIST_TEST_OPS test_static_model_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) @@ -21,7 +22,8 @@ list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) -list(APPEND DIST_TEST_OPS test_parallel_dygraph_hybrid_parallel) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_layer) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -71,10 +73,14 @@ endforeach() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_c_comm_init_all_op) + LIST(REMOVE_ITEM TEST_OPS test_c_concat) + LIST(REMOVE_ITEM TEST_OPS test_c_split) LIST(REMOVE_ITEM TEST_OPS test_allgather) + LIST(REMOVE_ITEM TEST_OPS test_c_identity) LIST(REMOVE_ITEM TEST_OPS test_allreduce) LIST(REMOVE_ITEM TEST_OPS test_broadcast) LIST(REMOVE_ITEM TEST_OPS test_collective_reduce) + LIST(REMOVE_ITEM TEST_OPS test_pipeline_parallel) LIST(REMOVE_ITEM TEST_OPS test_collective_scatter) LIST(REMOVE_ITEM TEST_OPS test_collective_sendrecv) LIST(REMOVE_ITEM TEST_OPS test_reducescatter) @@ -154,6 +160,8 @@ if(APPLE OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_fleet_metric) endif() +list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_hybrid_parallel) + if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) LIST(REMOVE_ITEM TEST_OPS test_rank_attention_op) # TODO(shenliang03): rank_attention_op support CPU device in future @@ -167,9 +175,11 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) - list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_hybrid_parallel) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_pipeline_layer) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) + LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -741,6 +751,7 @@ set_tests_properties(test_bicubic_interp_v2_op PROPERTIES TIMEOUT 120) set_tests_properties(test_gather_op PROPERTIES TIMEOUT 120) set_tests_properties(test_static_save_load PROPERTIES TIMEOUT 250) set_tests_properties(test_pylayer_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_paddle_save_load_binary PROPERTIES TIMEOUT 120) if (WIN32) set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 900) set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250) @@ -814,7 +825,7 @@ set_tests_properties(test_imperative_optimizer PROPERTIES TIMEOUT 120) set_tests_properties(test_pool2d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_transpose_op PROPERTIES TIMEOUT 120) set_tests_properties(test_eager_deletion_gru_net PROPERTIES TIMEOUT 120) -set_tests_properties(test_activation_op PROPERTIES TIMEOUT 180) +set_tests_properties(test_activation_op PROPERTIES TIMEOUT 270) set_tests_properties(test_normal PROPERTIES TIMEOUT 120) set_tests_properties(test_lstmp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) @@ -851,7 +862,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_hybrid_parallel PROPERTIES TIMEOUT 120 LABELS "RUN_TYPE=DIST") + set_tests_properties(test_parallel_dygraph_pipeline_layer PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) 
set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) @@ -864,14 +876,19 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_new_group_api PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE) set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) + set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240) endif() set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120) set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_pipeline_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT 120) set_tests_properties(test_allreduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_c_concat PROPERTIES TIMEOUT 120) + set_tests_properties(test_c_split PROPERTIES TIMEOUT 120) set_tests_properties(test_allgather PROPERTIES TIMEOUT 120) + set_tests_properties(test_c_identity PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_scatter PROPERTIES TIMEOUT 120) @@ -883,6 +900,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) test_collective_scatter_api test_collective_barrier_api test_collective_reduce_api + test_pipeline_parallel test_collective_allreduce_api test_new_group_api test_collective_broadcast_api diff --git a/python/paddle/fluid/tests/unittests/collective_concat_op.py b/python/paddle/fluid/tests/unittests/collective_concat_op.py new file mode 100644 index 0000000000000..c9de1713e7282 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_concat_op.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
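# Module summary (hedged, inferred from the code below): this runner builds a
# static program that appends a c_concat op over a [10, 1000] float32 input with
# ring_id=0 and nranks=2, so each of the two ranks contributes its shard and reads
# back the concatenated result; collective_identity_op.py and
# collective_split_op.py below exercise c_identity and c_split through the same
# TestCollectiveRunnerBase pattern.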
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base import TestCollectiveRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveConcat(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + nranks = 2 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofconcat", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_concat", + inputs={'X': tindata}, + attrs={ + 'ring_id': ring_id, + 'rank': self.rank, + 'nranks': nranks + }, + outputs={'Out': toutdata}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveConcat, "concat", 0) diff --git a/python/paddle/fluid/tests/unittests/collective_identity_op.py b/python/paddle/fluid/tests/unittests/collective_identity_op.py new file mode 100644 index 0000000000000..e024b64e82509 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_identity_op.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base import TestCollectiveRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveIdentity(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + nranks = 2 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofgather", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_identity", + inputs={'X': tindata}, + outputs={'Out': toutdata}, + attrs={'ring_id': ring_id, + 'nranks': nranks}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveIdentity, "identity", 0) diff --git a/python/paddle/fluid/tests/unittests/collective_split_op.py b/python/paddle/fluid/tests/unittests/collective_split_op.py new file mode 100644 index 0000000000000..553955354fe02 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_split_op.py @@ -0,0 +1,69 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base import TestCollectiveRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveAllGather(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + nranks = 2 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofsplit", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_split", + inputs={'X': tindata}, + attrs={ + 'ring_id': ring_id, + 'rank': self.rank, + 'nranks': nranks + }, + outputs={'Out': toutdata}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllGather, "split", 0) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index 29b4f1b05f9c2..630b804f9a2fb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -31,6 +31,10 @@ SEED = 2020 np.random.seed(SEED) +# TODO(zhhsplendid): This test is old so that use a static graph style +# mark it as TODO, to refactoring the code of this file. +paddle.enable_static() + def dyfunc_to_variable(x): res = fluid.dygraph.to_variable(x, name=None, zero_copy=None) @@ -54,11 +58,27 @@ def dyfunc_to_tensor(x): return res3 +def dyfunc_int_to_tensor(x): + res = paddle.to_tensor(3) + return res + + +def dyfunc_float_to_tensor(x): + res = paddle.to_tensor(2.0) + return res + + +def dyfunc_bool_to_tensor(x): + res = paddle.to_tensor(True) + return res + + class TestDygraphBasicApi_ToVariable(unittest.TestCase): def setUp(self): self.input = np.ones(5).astype("int32") self.test_funcs = [ - dyfunc_to_tensor, dyfunc_to_variable, dyfunc_to_variable_2, + dyfunc_to_tensor, dyfunc_bool_to_tensor, dyfunc_int_to_tensor, + dyfunc_float_to_tensor, dyfunc_to_variable, dyfunc_to_variable_2, dyfunc_to_variable_3 ] self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py index 528e388f6a2e2..bb95bdf9fc677 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py @@ -97,7 +97,7 @@ def __setattr__(self, name, value): # derived learning rate the to get the final learning rate. 
cfg.learning_rate = 0.001 # maximum number of iterations -cfg.max_iter = 20 if fluid.is_compiled_with_cuda() else 2 +cfg.max_iter = 20 if fluid.is_compiled_with_cuda() else 1 # Disable mixup in last N iter cfg.no_mixup_iter = 10 if fluid.is_compiled_with_cuda() else 1 # warm up to learning rate diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py new file mode 100644 index 0000000000000..248c271eec6a1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_amp.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import paddle +import numpy as np +from hybrid_parallel_mp_model import TestDistMPTraning +import paddle.distributed.fleet as fleet +import unittest + + +class TestMPClipGrad(TestDistMPTraning): + def build_optimizer(self, model): + grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0) + scheduler = paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.001, gamma=0.999, verbose=True) + optimizer = paddle.optimizer.SGD(scheduler, + grad_clip=grad_clip, + parameters=model.parameters()) + return optimizer + + def train_batch(self, batch, model, optimizer, is_mp): + scaler = paddle.amp.GradScaler(init_loss_scaling=5160) + if is_mp: + scaler = fleet.distributed_scaler(scaler) + with paddle.amp.auto_cast(): + output = model(batch) + loss = output.mean() + + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + + scaler.minimize(optimizer, scaled) # update parameters + optimizer.clear_grad() + return scaled + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py new file mode 100644 index 0000000000000..ad95aceaa2cf9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_clip_grad.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
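# Module summary (hedged, inferred from the class below and from
# hybrid_parallel_mp_model.py): this test reuses TestDistMPTraning and only
# overrides build_optimizer() to attach ClipGradByGlobalNorm(2.0) plus an
# ExponentialDecay schedule, so the inherited test_mp_model() loss comparison
# between the model-parallel net and the non-model-parallel reference net also
# covers gradient clipping; hybrid_parallel_mp_amp.py above does the same for
# GradScaler/AMP by additionally overriding train_batch().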
+ +from __future__ import division +from __future__ import print_function + +import paddle +import numpy as np +from hybrid_parallel_mp_model import TestDistMPTraning +import unittest +import logging + +#log = logging.getLogger("HybridParallel") +#log.setLevel(logging.WARNING) + + +class TestMPClipGrad(TestDistMPTraning): + def build_optimizer(self, model): + grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0) + scheduler = paddle.optimizer.lr.ExponentialDecay( + learning_rate=0.001, gamma=0.999, verbose=True) + optimizer = paddle.optimizer.SGD(scheduler, + grad_clip=grad_clip, + parameters=model.parameters()) + return optimizer + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py index ed5b9060e5eba..dfbef998a2f07 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py @@ -173,9 +173,9 @@ def test_row_parallel_layer(self): self.word_size = self.hcg.get_model_parallel_world_size() self.rank_id = self.hcg.get_model_parallel_rank() - input_size_per_card = 17 + input_size_per_card = 11 input_size = input_size_per_card * self.model_parallel_size - output_size_per_card = 13 + output_size_per_card = 10 output_size = output_size_per_card * self.model_parallel_size batch_size = 4 diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py index 0336f9220ab8c..767bf5d57e74a 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py @@ -21,7 +21,6 @@ import paddle.distributed as dist import paddle.fluid as fluid import paddle.distributed.fleet as fleet -import paddle.fluid.generator as generator from paddle.io import DataLoader, Dataset import unittest @@ -143,7 +142,7 @@ def __getitem__(self, index): return np_input_data -class TestDistTraning(unittest.TestCase): +class TestDistMPTraning(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() self.model_parallel_size = 2 @@ -155,7 +154,20 @@ def setUp(self): } fleet.init(is_collective=True, strategy=strategy) - def test_mp_model(self): + def train_batch(self, batch, model, optimizer, is_mp): + output = model(batch) + loss = output.mean() + loss.backward() # do backward + optimizer.step() # update parameters + optimizer.clear_grad() + return loss + + def build_optimizer(self, model): + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return optimizer + + def build_model_optimizer(self): hcg = fleet.get_hybrid_communicate_group() word_size = hcg.get_model_parallel_world_size() mp_id = hcg.get_model_parallel_rank() @@ -182,31 +194,29 @@ def test_mp_model(self): model_a = SimpleMPNet(vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2, mp_id) - optimizer_a = paddle.optimizer.SGD(learning_rate=0.001, - parameters=model_a.parameters()) + optimizer_a = self.build_optimizer(model_a) model_a = fleet.distributed_model(model_a) optimizer_a = fleet.distributed_optimizer(optimizer_a) model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2) - optimizer_b = paddle.optimizer.SGD(learning_rate=0.001, - parameters=model_b.parameters()) + optimizer_b = self.build_optimizer(model_b) + + return model_a, optimizer_a, model_b, optimizer_b, train_data_loader + + def 
test_mp_model(self): + model_a, optimizer_a, model_b, optimizer_b, train_data_loader = self.build_model_optimizer( + ) for step, batch in enumerate(train_data_loader): if step > 5: return - output_a = model_a(batch) - loss_a = output_a.mean() - loss_a.backward() - optimizer_a.step() - optimizer_a.clear_grad() - - output_b = model_b(batch) - loss_b = output_b.mean() - loss_b.backward() - optimizer_b.step() - optimizer_b.clear_grad() - np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy()) + + loss_a = self.train_batch(batch, model_a, optimizer_a, True) + loss_b = self.train_batch(batch, model_b, optimizer_b, False) + + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=1e-5) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py new file mode 100644 index 0000000000000..3130cbf458467 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_layer.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +import paddle +from paddle.distributed import fleet +import copy +from paddle.fluid.dygraph.container import Sequential +import paddle.nn as nn +from paddle.fluid.dygraph.layers import Layer +from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer +import paddle.nn.functional as F +import unittest + + +class AlexNet(Layer): + def __init__(self, num_classes=10): + super(AlexNet, self).__init__() + self.features = Sequential( + nn.Conv2D( + 3, 64, kernel_size=11, stride=4, padding=5), + nn.ReLU(), + nn.MaxPool2D( + kernel_size=2, stride=2), + nn.Conv2D( + 64, 192, kernel_size=5, padding=2), + nn.ReLU(), + nn.MaxPool2D( + kernel_size=2, stride=2), + nn.Conv2D( + 192, 384, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2D( + 384, 256, kernel_size=3, padding=1), + nn.ReLU(), + nn.Conv2D( + 256, 256, kernel_size=3, padding=1), + nn.ReLU(), + nn.MaxPool2D( + kernel_size=2, stride=2), ) + self.classifier = nn.Linear(256, num_classes) + self.loss_fn = nn.loss.CrossEntropyLoss() + + def forward(self, x, y): + x = self.features(x) + x.flatten() + + x = self.classifier(x) + return self.loss_fn(x, y) + + +class AlexNetPipe(AlexNet): + def to_layers(self): + feat = [self.features[i] for i in range(len(self.features))] + loss_fn = [lambda x: x.flatten(), self.classifier] + feat.extend(loss_fn) + return feat + + +class AlexNetPipeDesc(PipelineLayer): + def __init__(self, num_classes=10, **kwargs): + self.num_classes = num_classes + decs = [ + LayerDesc( + nn.Conv2D, 3, 64, kernel_size=11, stride=4, padding=5), + LayerDesc(nn.ReLU), + LayerDesc( + nn.MaxPool2D, kernel_size=2, stride=2), + LayerDesc( + nn.Conv2D, 64, 192, kernel_size=5, padding=2), + F.relu, + LayerDesc( + nn.MaxPool2D, kernel_size=2, stride=2), + LayerDesc( + nn.Conv2D, 192, 384, kernel_size=3, padding=1), + F.relu, + LayerDesc( + nn.Conv2D, 384, 256, kernel_size=3, 
padding=1), + F.relu, + LayerDesc( + nn.Conv2D, 256, 256, kernel_size=3, padding=1), + F.relu, + LayerDesc( + nn.MaxPool2D, kernel_size=2, stride=2), + lambda x: x.flatten(), + LayerDesc(nn.Linear, 256, self.num_classes), # classifier + ] + super(AlexNetPipeDesc, self).__init__( + layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs) + + +class TestPipeLayerAPI(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 1, + "pp_degree": self.model_parallel_size + } + fleet.init(is_collective=True, strategy=strategy) + self.hcg = fleet.get_hybrid_communicate_group() + + def test_pipelayer_desc(self): + pipe_model = AlexNetPipeDesc(num_stages=self.model_parallel_size) + np.testing.assert_array_equal(len(pipe_model.parameters()), 6) + + def test_pipelayer_sequential(self): + init_net = AlexNetPipe() + pipe_model = PipelineLayer( + layers=init_net.to_layers(), + num_stages=self.model_parallel_size, + loss_fn=nn.CrossEntropyLoss()) + stage_id = self.hcg.get_stage_id() + init_parameters = init_net.parameters() + pipe_parameters = pipe_model.parameters() + part_number = len(init_parameters) // 2 + + if stage_id == 0: + for idx in range(part_number): + param_a = init_parameters[idx] + param_b = pipe_parameters[idx] + np.testing.assert_array_equal(param_a.name, param_b.name) + np.testing.assert_allclose(param_a.numpy(), param_b.numpy()) + + elif stage_id == 1: + for idx in range(part_number): + param_a = init_parameters[idx + part_number] + param_b = pipe_parameters[idx] + + np.testing.assert_array_equal(param_a.name, param_b.name) + np.testing.assert_allclose(param_a.numpy(), param_b.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py new file mode 100644 index 0000000000000..9b9283a1a9b6e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_model.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
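# Module summary (hedged, inferred from the code below): this test initializes
# fleet with pp_degree=2 (dp_degree=mp_degree=1, accumulate_steps=2), wraps a
# stack of HIDDEN_DIM Linear layers with fleet.distributed_model, and builds a
# single-batch generator DataLoader on the two pipeline stages; it is a
# setup/stage-assignment smoke test, while the PipelineLayer/LayerDesc
# partitioning checks live in hybrid_parallel_pp_layer.py above.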
+ +from __future__ import division +from __future__ import print_function + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle.io import DataLoader, Dataset +import unittest + + +def set_random_seed(seed, dp_id, rank_id): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed + dp_id) + paddle.seed(seed + rank_id) + + +HIDDEN_DIM = 32 +LAYERS = 8 + + +def sequential_model(): + model = paddle.nn.Sequential( + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, HIDDEN_DIM), + paddle.nn.Linear(HIDDEN_DIM, 1), ) + return model + + +class TestDistPPTraning(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 1 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 2 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + strategy.pipeline_configs = {"accumulate_steps": 2} + paddle.distributed.init_parallel_env() + fleet.init(is_collective=True, strategy=strategy) + + def test_mp_model(self): + batch_input = paddle.randn(shape=(1, HIDDEN_DIM), dtype="float32") + pipe_model = sequential_model() + sgd = paddle.optimizer.SGD(learning_rate=0.0003, parameters=[]) + pipe_model = paddle.distributed.fleet.distributed_model(pipe_model) + + if pipe_model.stage_id == 0 or pipe_model.stage_id == 1: + pipe_input = batch_input.clone().detach() + pipe_input = paddle.cast(pipe_input, 'float32') + + def data_gen(): + gen = True + while gen: + yield [pipe_input, 0] + gen = False + + loader = paddle.io.DataLoader.from_generator(capacity=5) + loader.set_batch_generator(data_gen) + data_iter = iter(loader) + else: + data_iter = None + return True + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py index 48706bf5ad1fd..3daa50020bab2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py @@ -55,5 +55,182 @@ def test_check_output(self): self.check_output_with_option(use_gpu[i]) +class FCFusePassTRTDynamicDims2Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[32, 128], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=64, + num_flatten_dims=1, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = {"data": np.random.random((32, 128)).astype("float32")} + self.enable_trt = True + self.trt_parameters = FCFusePassTRTDynamicDims2Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = FCFusePassTRTDynamicDims2Test.DynamicShapeParam( + { + 'data': [1, 128] + }, {'data': [64, 128]}, {'data': [32, 128]}, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in 
range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +class FCFusePassTRTDynamicDims3Cols1Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[32, 128, 32], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=64, + num_flatten_dims=1, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = {"data": np.random.random((32, 128, 32)).astype("float32")} + self.enable_trt = True + self.trt_parameters = FCFusePassTRTDynamicDims3Cols1Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = FCFusePassTRTDynamicDims3Cols1Test.DynamicShapeParam( + { + 'data': [1, 128, 32] + }, {'data': [64, 128, 32]}, {'data': [32, 128, 32]}, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +class FCFusePassTRTDynamicDims3Cols2Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[32, 128, 32], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=64, + num_flatten_dims=2, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = {"data": np.random.random((32, 128, 32)).astype("float32")} + self.enable_trt = True + self.trt_parameters = FCFusePassTRTDynamicDims3Cols2Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = FCFusePassTRTDynamicDims3Cols2Test.DynamicShapeParam( + { + 'data': [1, 32, 32] + }, {'data': [64, 256, 32]}, {'data': [32, 128, 32]}, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +class FCFusePassTRTDynamicDims4Cols1Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[32, 12, 4, 6], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=64, + num_flatten_dims=1, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = { + "data": np.random.random((32, 12, 4, 6)).astype("float32") + } + self.enable_trt = True + self.trt_parameters = FCFusePassTRTDynamicDims4Cols1Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = FCFusePassTRTDynamicDims4Cols1Test.DynamicShapeParam( + { + 'data': [1, 12, 4, 6] + }, {'data': [64, 12, 4, 6]}, {'data': [32, 12, 4, 6]}, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +class FCFusePassTRTDynamicDims4Cols2Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[32, 128, 32, 32], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=64, + num_flatten_dims=2, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = { + "data": np.random.random((32, 128, 32, 32)).astype("float32") + } + self.enable_trt = True + self.trt_parameters = 
FCFusePassTRTDynamicDims4Cols2Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = FCFusePassTRTDynamicDims4Cols2Test.DynamicShapeParam( + { + 'data': [1, 64, 32, 32] + }, {'data': [64, 256, 32, 32]}, {'data': [32, 128, 32, 32]}, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + +class FCFusePassTRTDynamicDims4Cols3Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[32, 128, 32, 32], dtype="float32") + fc_out1 = fluid.layers.fc(input=data, + size=64, + num_flatten_dims=3, + act="relu") + out = fluid.layers.softmax(input=fc_out1) + + self.feeds = { + "data": np.random.random((32, 128, 32, 32)).astype("float32") + } + self.enable_trt = True + self.trt_parameters = FCFusePassTRTDynamicDims4Cols3Test.TensorRTParam( + 1 << 30, 32, 2, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = FCFusePassTRTDynamicDims4Cols3Test.DynamicShapeParam( + { + 'data': [1, 128, 32, 32] + }, {'data': [64, 128, 32, 32]}, {'data': [32, 128, 32, 32]}, False) + self.fetch_list = [out] + + def test_check_output(self): + use_gpu = [False] + if core.is_compiled_with_cuda(): + use_gpu.append(True) + for i in range(len(use_gpu)): + self.check_output_with_option(use_gpu[i]) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py index ac80ea4c62cbf..8828892dca3cc 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_amp_check_finite_and_scale_op_npu.py @@ -19,106 +19,128 @@ from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.contrib.mixed_precision.amp_nn import check_finite_and_unscale paddle.enable_static() @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") -class TestCheckFiniteAndUnscaleOp(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "check_finite_and_unscale" - self.place = paddle.NPUPlace(0) - self.init_dtype() - x = np.random.random((1024, 1024)).astype(self.dtype) - scale = np.random.random((1)).astype(self.dtype) - - self.inputs = {'X': [('x0', x)], 'Scale': scale} - self.outputs = { - 'FoundInfinite': np.array([0]), - 'Out': [('out0', x / scale)], - } - - def set_npu(self): - self.__class__.use_npu = True - - def init_kernel_type(self): - self.use_mkldnn = False - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - self.check_output_with_place(self.place, check_dygraph=False) - - -@unittest.skipIf(not paddle.is_compiled_with_npu(), - "core is not compiled with NPU") -class TestCheckFiniteAndUnscaleOpWithNan(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "check_finite_and_unscale" - self.place = paddle.NPUPlace(0) - self.init_dtype() - x = np.random.random((1024, 1024)).astype(self.dtype) - x[128][128] = np.nan - scale = np.random.random((1)).astype(self.dtype) - - self.inputs = {'X': [('x0', x)], 'Scale': scale} - self.outputs = { - 'FoundInfinite': np.array([1]), - 'Out': [('out0', 
x)], - } - - def set_npu(self): - self.__class__.use_npu = True - - def init_kernel_type(self): - self.use_mkldnn = False - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - # When input contains nan, do not check the output, - # since the output may be nondeterministic and will be discarded. - self.check_output_with_place( - self.place, check_dygraph=False, no_check_set=['Out']) +class TestCheckFiniteAndUnscale(unittest.TestCase): + def get_prog(self): + paddle.enable_static() + main_program = paddle.static.Program() + with program_guard(main_program): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + scale = paddle.static.data(name="scale", shape=[1], dtype='float32') + float_status = paddle.static.data( + name="status", shape=[8], dtype='float32') + main_program.global_block().append_op( + type="alloc_float_status", + outputs={"FloatStatus": float_status}, ) + c = paddle.fluid.layers.elementwise_div(a, b) + out, found_inf = check_finite_and_unscale( + [c], scale, float_status=float_status) + + return main_program, out, found_inf, float_status + + def run_prog(self, a, b, scale): + main_program, out, found_inf, float_status = self.get_prog() + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + out_, founf_inf_, float_status_ = exe.run( + main_program, + feed={"a": a, + "b": b, + "scale": scale}, + fetch_list=[out, found_inf, float_status]) + print(float_status_) + return out_, founf_inf_ + + def test_contains_nan(self): + a = np.zeros((32, 32)).astype('float32') + b = np.zeros((32, 32)).astype('float32') + scale = np.array([2.0]).astype('float32') + + out, found_inf = self.run_prog(a, b, scale) + print(out, found_inf) + + self.assertTrue(found_inf[0]) + + def test_contains_inf(self): + a = np.ones((32, 32)).astype('float32') + b = np.zeros((32, 32)).astype('float32') + scale = np.array([2.0]).astype('float32') + + out, found_inf = self.run_prog(a, b, scale) + print(out, found_inf) + + self.assertTrue(found_inf[0]) + + def test_not_contains_nan_inf(self): + a = np.ones((32, 32)).astype('float32') + b = np.ones((32, 32)).astype('float32') + scale = np.array([2.0]).astype('float32') + + out, found_inf = self.run_prog(a, b, scale) + print(out, found_inf) + + self.assertTrue(np.allclose(out, (a / b) / scale[0])) + self.assertFalse(found_inf[0]) @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") -class TestCheckFiniteAndUnscaleOpWithInf(OpTest): - def setUp(self): - self.set_npu() - self.op_type = "check_finite_and_unscale" - self.place = paddle.NPUPlace(0) - self.init_dtype() - x = np.random.random((1024, 1024)).astype(self.dtype) - x[128][128] = np.inf - scale = np.random.random((1)).astype(self.dtype) - - self.inputs = {'X': [('x0', x)], 'Scale': scale} - self.outputs = { - 'FoundInfinite': np.array([1]), - 'Out': [('out0', x)], - } - - def set_npu(self): - self.__class__.use_npu = True - - def init_kernel_type(self): - self.use_mkldnn = False - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - # When input contains inf, do not check the output, - # since the output may be nondeterministic and will be discarded. 
- self.check_output_with_place( - self.place, check_dygraph=False, no_check_set=['Out']) +class TestCheckFiniteAndUnscaleClearFloatStatus(unittest.TestCase): + def get_prog(self): + paddle.enable_static() + main_program = paddle.static.Program() + with program_guard(main_program): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + scale = paddle.static.data(name="scale", shape=[1], dtype='float32') + float_status = paddle.static.data( + name="status", shape=[8], dtype='float32') + main_program.global_block().append_op( + type="alloc_float_status", + outputs={"FloatStatus": float_status}, ) + c = paddle.fluid.layers.elementwise_div(a, b) + out, found_inf = check_finite_and_unscale( + [c], scale, float_status=float_status) + main_program.global_block().append_op( + type="alloc_float_status", + outputs={"FloatStatus": float_status}, ) + d = paddle.fluid.layers.elementwise_add(a, b) + out, found_inf = check_finite_and_unscale( + [d], scale, float_status=float_status) + + return main_program, out, found_inf, float_status + + def run_prog(self, a, b, scale): + main_program, out, found_inf, float_status = self.get_prog() + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + out_, founf_inf_, float_status_ = exe.run( + main_program, + feed={"a": a, + "b": b, + "scale": scale}, + fetch_list=[out, found_inf, float_status]) + print(float_status_) + return out_, founf_inf_ + + def test_not_contains_nan_inf(self): + a = np.ones((32, 32)).astype('float32') + b = np.zeros((32, 32)).astype('float32') + scale = np.array([2.0]).astype('float32') + + out, found_inf = self.run_prog(a, b, scale) + print(out, found_inf) + + self.assertTrue(np.allclose(out, (a + b) / scale[0])) + self.assertFalse(found_inf[0]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py index 37b446174d6d0..133367a5f3625 100644 --- a/python/paddle/fluid/tests/unittests/op_test_xpu.py +++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py @@ -296,7 +296,7 @@ def check_grad_with_place(self, no_grad_set=no_grad_set) self._assert_is_close(a1, a2, inputs_to_check, 0.00000001, "Gradient Check On two xpu") - self._assert_is_close(a1, a3, inputs_to_check, 0.001, + self._assert_is_close(a1, a3, inputs_to_check, max_relative_error, "Gradient Check On cpu & xpu") def get_grad_with_place(self, diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py index 2e4b1828c5bbe..1f02562dcb4fb 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_net.py +++ b/python/paddle/fluid/tests/unittests/seresnext_net.py @@ -173,7 +173,7 @@ def optimizer(learning_rate=0.01): def batch_size(use_device): if use_device == DeviceType.CUDA: # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size. - return 8 + return 4 return 12 diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py b/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py new file mode 100644 index 0000000000000..416f6bc4f0d41 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_by_col.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
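(Reading aid for the static_model_parallel_* tests that start here; editorial sketch, not part of the patch.) Each of them builds the same linear layer twice, once through paddle.distributed.split across two ranks and once as an ordinary fc holding the full weight, and the distributed result must match the single-process one. For the column-parallel case below, the weight slicing and the equivalence being tested reduce to:

import numpy as np

IN_SIZE, OUT_SIZE = 4, 4  # 2 * MODEL_PARALLEL_SIZE, as in the tests
np.random.seed(2021)
np_weight = np.random.uniform(-1, 1, size=(IN_SIZE, OUT_SIZE)).astype('float32')

def column_shard(rank):
    start_col = 0 if rank == 0 else OUT_SIZE // 2  # rank 0 keeps the left columns, rank 1 the right
    return np_weight[:, start_col:start_col + OUT_SIZE // 2]

x = np.random.uniform(-1, 1, size=(2, IN_SIZE)).astype('float32')
full = x.dot(np_weight)
sharded = np.concatenate([x.dot(column_shard(0)), x.dot(column_shard(1))], axis=1)
assert np.allclose(full, sharded)  # concatenating per-rank outputs recovers the full fc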
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main +import paddle.distributed.fleet as fleet + +paddle.enable_static() + +DTYPE = "float32" +MODEL_PARALLEL_SIZE = 2 +IN_SIZE = 2 * MODEL_PARALLEL_SIZE +OUT_SIZE = 2 * MODEL_PARALLEL_SIZE + +# Fix seed for test +#fluid.default_startup_program().random_seed = 1 +#fluid.default_main_program().random_seed = 1 + + +def create_model(data, rank): + np.random.seed(2021) + np_weight = np.random.uniform(-1, 1, size=(IN_SIZE, OUT_SIZE)).astype(DTYPE) + if rank is not None: + start_col = 0 if rank == 0 else OUT_SIZE // 2 + np_weight_part = np_weight[:, start_col:start_col + OUT_SIZE // 2] + result = paddle.distributed.split( + data, + size=(IN_SIZE, OUT_SIZE), + operation='linear', + axis=1, + num_partitions=MODEL_PARALLEL_SIZE, + weight_attr=paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + np_weight_part)), + bias_attr=False, ) + else: + result = fluid.layers.fc( + data, + size=OUT_SIZE, + param_attr=paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(np_weight)), + bias_attr=False, ) + + predict = paddle.sum(result) + return predict + + +class TestModelParallel(TestDistRunnerBase): + def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): + # Input data + data_in = fluid.data( + name='data_in', shape=[batch_size, IN_SIZE], dtype=DTYPE) + + if dist_strategy: + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[data_in], + capacity=64, + use_double_buffer=False, + iterable=False) + + if dist_strategy: + fleet.init(is_collective=True) + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2} + + rank = fleet.worker_index() if dist_strategy else None + avg_cost = create_model(data_in, rank) + opt = fluid.optimizer.SGD(0.1) + + if dist_strategy: + dist_opt = fleet.distributed_optimizer( + optimizer=opt, strategy=strategy) + dist_opt.minimize(avg_cost) + else: + opt.minimize(avg_cost) + + def gen_data(): + np.random.seed(2021) + while True: + data = [np.random.random([IN_SIZE]).astype(DTYPE)] + yield data + + train_reader = paddle.batch(gen_data, batch_size=batch_size) + + if dist_strategy: + return None, avg_cost, train_reader, None, None, None, data_loader + else: + return None, avg_cost, train_reader, None, None, None + + +if __name__ == "__main__": + runtime_main(TestModelParallel) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py b/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py new file mode 100644 index 0000000000000..4a98792f8a047 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_by_row.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main +import paddle.distributed.fleet as fleet + +paddle.enable_static() + +DTYPE = "float32" +MODEL_PARALLEL_SIZE = 2 +IN_SIZE = 2 * MODEL_PARALLEL_SIZE +OUT_SIZE = 2 * MODEL_PARALLEL_SIZE + +# Fix seed for test +#fluid.default_startup_program().random_seed = 1 +#fluid.default_main_program().random_seed = 1 + + +def create_model(data, rank): + np.random.seed(2021) + np_weight = np.random.uniform(-1, 1, size=(IN_SIZE, OUT_SIZE)).astype(DTYPE) + if rank is not None: + start_row = 0 if rank == 0 else IN_SIZE // 2 + np_weight_part = np_weight[start_row:start_row + IN_SIZE // 2, :] + result = paddle.distributed.split( + data, + size=(IN_SIZE, OUT_SIZE), + operation='linear', + axis=0, + num_partitions=MODEL_PARALLEL_SIZE, + weight_attr=paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + np_weight_part)), + bias_attr=False, ) + else: + result = fluid.layers.fc( + data, + size=OUT_SIZE, + param_attr=paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(np_weight)), + bias_attr=False, ) + + predict = paddle.sum(result) + return predict + + +class TestModelParallel(TestDistRunnerBase): + def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): + # Input data + data_in = fluid.data( + name='data_in', shape=[batch_size, IN_SIZE], dtype=DTYPE) + + if dist_strategy: + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[data_in], + capacity=64, + use_double_buffer=False, + iterable=False) + + if dist_strategy: + fleet.init(is_collective=True) + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2} + + rank = fleet.worker_index() if dist_strategy else None + avg_cost = create_model(data_in, rank) + opt = fluid.optimizer.SGD(0.1) + + if dist_strategy: + dist_opt = fleet.distributed_optimizer( + optimizer=opt, strategy=strategy) + dist_opt.minimize(avg_cost) + else: + opt.minimize(avg_cost) + + def gen_data(): + np.random.seed(2021) + while True: + data = [np.random.random([IN_SIZE]).astype(DTYPE)] + yield data + + train_reader = paddle.batch(gen_data, batch_size=batch_size) + + if dist_strategy: + return None, avg_cost, train_reader, None, None, None, data_loader + else: + return None, avg_cost, train_reader, None, None, None + + +if __name__ == "__main__": + runtime_main(TestModelParallel) diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py b/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py new file mode 100644 index 0000000000000..4a98792f8a047 --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/static_model_parallel_embedding.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main +import paddle.distributed.fleet as fleet + +paddle.enable_static() + +DTYPE = "float32" +MODEL_PARALLEL_SIZE = 2 +IN_SIZE = 2 * MODEL_PARALLEL_SIZE +OUT_SIZE = 2 * MODEL_PARALLEL_SIZE + +# Fix seed for test +#fluid.default_startup_program().random_seed = 1 +#fluid.default_main_program().random_seed = 1 + + +def create_model(data, rank): + np.random.seed(2021) + np_weight = np.random.uniform(-1, 1, size=(IN_SIZE, OUT_SIZE)).astype(DTYPE) + if rank is not None: + start_row = 0 if rank == 0 else IN_SIZE // 2 + np_weight_part = np_weight[start_row:start_row + IN_SIZE // 2, :] + result = paddle.distributed.split( + data, + size=(IN_SIZE, OUT_SIZE), + operation='linear', + axis=0, + num_partitions=MODEL_PARALLEL_SIZE, + weight_attr=paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer( + np_weight_part)), + bias_attr=False, ) + else: + result = fluid.layers.fc( + data, + size=OUT_SIZE, + param_attr=paddle.ParamAttr( + initializer=fluid.initializer.NumpyArrayInitializer(np_weight)), + bias_attr=False, ) + + predict = paddle.sum(result) + return predict + + +class TestModelParallel(TestDistRunnerBase): + def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None): + # Input data + data_in = fluid.data( + name='data_in', shape=[batch_size, IN_SIZE], dtype=DTYPE) + + if dist_strategy: + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[data_in], + capacity=64, + use_double_buffer=False, + iterable=False) + + if dist_strategy: + fleet.init(is_collective=True) + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2} + + rank = fleet.worker_index() if dist_strategy else None + avg_cost = create_model(data_in, rank) + opt = fluid.optimizer.SGD(0.1) + + if dist_strategy: + dist_opt = fleet.distributed_optimizer( + optimizer=opt, strategy=strategy) + dist_opt.minimize(avg_cost) + else: + opt.minimize(avg_cost) + + def gen_data(): + np.random.seed(2021) + while True: + data = [np.random.random([IN_SIZE]).astype(DTYPE)] + yield data + + train_reader = paddle.batch(gen_data, batch_size=batch_size) + + if dist_strategy: + return None, avg_cost, train_reader, None, None, None, data_loader + else: + return None, avg_cost, train_reader, None, None, None + + +if __name__ == "__main__": + runtime_main(TestModelParallel) diff --git a/python/paddle/fluid/tests/unittests/test_Tensor_type.py 
b/python/paddle/fluid/tests/unittests/test_Tensor_type.py new file mode 100644 index 0000000000000..59395b94279ea --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_Tensor_type.py @@ -0,0 +1,60 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core + + +class TensorTypeTest(unittest.TestCase): + def test_type_totensor(self): + paddle.disable_static() + inx = np.array([1, 2]) + tensorx = paddle.to_tensor(inx) + typex_str = str(type(tensorx)) + expectx = "" + self.assertEqual((typex_str == expectx), True) + + def test_type_Tensor(self): + paddle.disable_static() + inx = np.array([1, 2]) + tensorx = paddle.Tensor(inx) + typex_str = str(type(tensorx)) + expectx = "" + self.assertEqual((typex_str == expectx), True) + + tensorx = paddle.tensor.logic.Tensor(inx) + typex_str = str(type(tensorx)) + expectx = "" + self.assertEqual((typex_str == expectx), True) + + def test_type_core(self): + paddle.disable_static() + inx = np.array([1, 2]) + tensorx = core.VarBase(inx) + typex_str = str(type(tensorx)) + expectx = "" + self.assertEqual((typex_str == expectx), True) + + tensorx = paddle.framework.VarBase(inx) + typex_str = str(type(tensorx)) + expectx = "" + self.assertEqual((typex_str == expectx), True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index ea183e9444878..92465c3e28401 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -119,6 +119,72 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', max_relative_error=0.01) +class TestSilu(TestActivation): + def setUp(self): + self.op_type = "silu" + self.init_dtype() + + np.random.seed(1024) + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + out = x / (np.exp(-x) + 1) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out') + + +class TestSiluAPI(unittest.TestCase): + # test paddle.nn.Silu, paddle.nn.functional.silu + def setUp(self): + self.x_np = np.random.uniform(-1, 1, [11, 17]).astype('float32') + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else paddle.CPUPlace() + + def test_static_api(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.fluid.data('X', [11, 17]) + out1 = F.silu(x) + m = paddle.nn.Silu() + out2 = m(x) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={'X': self.x_np}, fetch_list=[out1, out2]) + out_ref = self.x_np / (1 + np.exp(-self.x_np)) + for r in res: + self.assertEqual(np.allclose(out_ref, r), True) + + def test_dygraph_api(self): + 
paddle.disable_static(self.place) + x = paddle.to_tensor(self.x_np) + out1 = F.silu(x) + m = paddle.nn.Silu() + out2 = m(x) + out_ref = self.x_np / (1 + np.exp(-self.x_np)) + for r in [out1, out2]: + self.assertEqual(np.allclose(out_ref, r.numpy()), True) + paddle.enable_static() + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # The input type must be Variable. + self.assertRaises(TypeError, F.silu, 1) + # The input dtype must be float16, float32, float64. + x_int32 = paddle.fluid.data( + name='x_int32', shape=[11, 17], dtype='int32') + self.assertRaises(TypeError, F.silu, x_int32) + # support the input dtype is float16 + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[11, 17], dtype='float16') + F.silu(x_fp16) + + class TestLogSigmoid(TestActivation): def setUp(self): self.op_type = "logsigmoid" @@ -2629,6 +2695,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestActivation) create_test_act_fp16_class(TestSigmoid) +create_test_act_fp16_class(TestSilu) create_test_act_fp16_class(TestLogSigmoid) create_test_act_fp16_class(TestTanh) create_test_act_fp16_class(TestTanhshrink) diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index 82ddafb8f956f..fe82b23b73bdb 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -94,10 +94,8 @@ def test_errors(self): x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8") self.assertRaises(TypeError, fluid.layers.assign, x3) # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. - x4 = np.array([[2.5, 2.5]], dtype='float64') + x4 = np.array([[2.5, 2.5]], dtype='uint8') self.assertRaises(TypeError, fluid.layers.assign, x4) - x5 = np.array([[2.5, 2.5]], dtype='uint8') - self.assertRaises(TypeError, fluid.layers.assign, x5) class TestAssignOApi(unittest.TestCase): @@ -157,6 +155,23 @@ def test_assign_NumpyArray3(self): paddle.assign(array, result1) self.assertTrue(np.allclose(result1.numpy(), array)) + def test_assign_List(self): + paddle.disable_static() + l = [1, 2, 3] + result = paddle.assign(l) + self.assertTrue(np.allclose(result.numpy(), np.array(l))) + paddle.enable_static() + + def test_assign_BasicTypes(self): + paddle.disable_static() + result1 = paddle.assign(2) + result2 = paddle.assign(3.0) + result3 = paddle.assign(True) + self.assertTrue(np.allclose(result1.numpy(), np.array([2]))) + self.assertTrue(np.allclose(result2.numpy(), np.array([3.0]))) + self.assertTrue(np.allclose(result3.numpy(), np.array([1]))) + paddle.enable_static() + class TestAssignOpErrorApi(unittest.TestCase): def test_errors(self): @@ -169,10 +184,8 @@ def test_errors(self): x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8") self.assertRaises(TypeError, paddle.assign, x3) # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. - x4 = np.array([[2.5, 2.5]], dtype='float64') + x4 = np.array([[2.5, 2.5]], dtype='uint8') self.assertRaises(TypeError, paddle.assign, x4) - x5 = np.array([[2.5, 2.5]], dtype='uint8') - self.assertRaises(TypeError, paddle.assign, x5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_c_concat.py b/python/paddle/fluid/tests/unittests/test_c_concat.py new file mode 100644 index 0000000000000..20f166af14c9c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_c_concat.py @@ -0,0 +1,34 @@ +# Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_base import TestDistBase + +paddle.enable_static() + + +class TestConcatOp(TestDistBase): + def _setup_config(self): + pass + + def test_concat(self, col_type="concat"): + self.check_with_place("collective_concat_op.py", col_type) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_c_identity.py b/python/paddle/fluid/tests/unittests/test_c_identity.py new file mode 100644 index 0000000000000..c780f800d1ed5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_c_identity.py @@ -0,0 +1,34 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_base import TestDistBase + +paddle.enable_static() + + +class TestIdentityOp(TestDistBase): + def _setup_config(self): + pass + + def test_identity(self, col_type="identity"): + self.check_with_place("collective_identity_op.py", col_type) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_c_split.py b/python/paddle/fluid/tests/unittests/test_c_split.py new file mode 100644 index 0000000000000..0a5d91e0625e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_c_split.py @@ -0,0 +1,34 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
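(Note on the three new collective wrappers test_c_concat, test_c_identity and test_c_split: the reference results they are checked against are added to test_collective_base.check_with_place later in this patch.) In numpy terms, with input1 and input2 being the two ranks' local tensors, the expected outputs are:

import numpy as np

input1 = np.random.rand(2, 4).astype('float32')  # rank 0's local tensor
input2 = np.random.rand(2, 4).astype('float32')  # rank 1's local tensor

# identity: each rank simply keeps its own input
identity_rank0, identity_rank1 = input1, input2

# concat: both ranks receive the tensors joined along axis 1
concat_both_ranks = np.concatenate((input1, input2), axis=1)

# split: rank i keeps the i-th half of its own input along axis 1
split_rank0 = np.split(input1, 2, axis=1)[0]
split_rank1 = np.split(input2, 2, axis=1)[1]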
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_base import TestDistBase + +paddle.enable_static() + + +class TestSplitOp(TestDistBase): + def _setup_config(self): + pass + + def test_split(self, col_type="split"): + self.check_with_place("collective_split_op.py", col_type) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_clip_op.py b/python/paddle/fluid/tests/unittests/test_clip_op.py index 2946798a82f78..b05100fc7b433 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_op.py @@ -50,10 +50,14 @@ def setUp(self): self.outputs = {'Out': np.clip(self.inputs['X'], min_v, max_v)} def test_check_output(self): + paddle.enable_static() self.check_output() + paddle.disable_static() def test_check_grad_normal(self): + paddle.enable_static() self.check_grad(['X'], 'Out') + paddle.disable_static() def initTestCase(self): self.shape = (4, 10, 10) @@ -102,6 +106,7 @@ def initTestCase(self): class TestClipOpError(unittest.TestCase): def test_errors(self): + paddle.enable_static() with program_guard(Program(), Program()): input_data = np.random.random((2, 4)).astype("float32") @@ -115,6 +120,7 @@ def test_dtype(): fluid.layers.clip(x=x2, min=-1.0, max=1.0) self.assertRaises(TypeError, test_dtype) + paddle.disable_static() class TestClipAPI(unittest.TestCase): @@ -140,7 +146,10 @@ def test_clip(self): out_8 = paddle.clip(images) out_9 = paddle.clip(paddle.cast(images, 'float64'), min=0.2, max=0.9) - res1, res2, res3, res4, res5, res6, res7, res8, res9 = exe.run( + out_10 = paddle.clip(paddle.cast(images * 10, 'int32'), min=2, max=8) + out_11 = paddle.clip(paddle.cast(images * 10, 'int64'), min=2, max=8) + + res1, res2, res3, res4, res5, res6, res7, res8, res9, res10, res11 = exe.run( fluid.default_main_program(), feed={ "image": data, @@ -148,7 +157,8 @@ def test_clip(self): "max": np.array([0.8]).astype('float32') }, fetch_list=[ - out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8, out_9 + out_1, out_2, out_3, out_4, out_5, out_6, out_7, out_8, out_9, + out_10, out_11 ]) self.assertTrue(np.allclose(res1, data.clip(0.2, 0.8))) @@ -161,8 +171,14 @@ def test_clip(self): self.assertTrue(np.allclose(res8, data)) self.assertTrue( np.allclose(res9, data.astype(np.float64).clip(0.2, 0.9))) + self.assertTrue( + np.allclose(res10, (data * 10).astype(np.int32).clip(2, 8))) + self.assertTrue( + np.allclose(res11, (data * 10).astype(np.int64).clip(2, 8))) + paddle.disable_static() def test_clip_dygraph(self): + paddle.disable_static() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() paddle.disable_static(place) @@ -176,9 +192,16 @@ def test_clip_dygraph(self): out_2 = paddle.clip(images, min=0.2, max=0.9) out_3 = paddle.clip(images, min=v_min, max=v_max) + out_4 = paddle.clip(paddle.cast(images * 10, 'int32'), min=2, max=8) + out_5 = paddle.clip(paddle.cast(images * 10, 'int64'), min=2, max=8) + self.assertTrue(np.allclose(out_1.numpy(), data.clip(0.2, 0.8))) self.assertTrue(np.allclose(out_2.numpy(), data.clip(0.2, 0.9))) self.assertTrue(np.allclose(out_3.numpy(), data.clip(0.2, 0.8))) + self.assertTrue( + np.allclose(out_4.numpy(), (data * 10).astype(np.int32).clip(2, 8))) + self.assertTrue( + np.allclose(out_5.numpy(), (data * 10).astype(np.int64).clip(2, 8))) def test_errors(self): paddle.enable_static() @@ -186,6 +209,7 @@ def test_errors(self): x2 = fluid.data(name='x2', shape=[1], 
dtype="int8") self.assertRaises(TypeError, paddle.clip, x=x1, min=0.2, max=0.8) self.assertRaises(TypeError, paddle.clip, x=x2, min=0.2, max=0.8) + paddle.disable_static() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py index fc267ed914ec2..697e8d32d67a8 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_base.py @@ -274,6 +274,11 @@ def check_with_place(self, self.assertTrue( np.allclose( tr1_out, need_result, rtol=1e-05, atol=1e-05)) + elif col_type == "identity": + need_result1 = input1 + need_result2 = input2 + self.assertTrue(np.allclose(tr0_out, need_result1, rtol=0, atol=0)) + self.assertTrue(np.allclose(tr1_out, need_result2, rtol=0, atol=0)) elif col_type == "reduce_slicegather": slicesize = input1.shape[0] // 2 tmp10 = input1[0:slicesize] @@ -284,5 +289,22 @@ def check_with_place(self, need_result2 = np.concatenate((tmp20, tmp21), axis=1) self.assertTrue(np.allclose(tr0_out, need_result1)) self.assertTrue(np.allclose(tr1_out, need_result2)) + elif col_type == "concat": + need_result = np.concatenate((input1, input2), axis=1) + self.assertTrue( + np.allclose( + tr0_out, need_result, rtol=1e-05, atol=1e-05)) + self.assertTrue( + np.allclose( + tr1_out, need_result, rtol=1e-05, atol=1e-05)) + elif col_type == "split": + need_result1 = np.split(input1, 2, axis=1)[0] + need_result2 = np.split(input2, 2, axis=1)[1] + self.assertTrue( + np.allclose( + tr0_out, need_result1, rtol=1e-05, atol=1e-05)) + self.assertTrue( + np.allclose( + tr1_out, need_result2, rtol=1e-05, atol=1e-05)) else: pass diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index fbf7384b86bc1..8dc80c8931269 100644 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -122,6 +122,23 @@ def test_broadcast_api_1(self): fetch_list=[out]) self.assertEqual((res == real_result).all(), True) + def test_broadcast_api_2(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[1, 2, 3], dtype='int32') + y = paddle.static.data( + name='y', shape=[1, 2, 1, 3], dtype='int32') + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(np.int32) + input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(np.int32) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + def test_attr_name(self): paddle.enable_static() with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py index d615f7cb7044e..f3878dfa2bc76 100644 --- a/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_dgc_optimizer.py @@ -119,8 +119,8 @@ def check_dgc_momentum_optimizer(self, init_ops_count = 5 if name == "momentum" else 9 init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), init_ops_count) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + self.assertEqual(init_ops[-1].type, "fill_constant") + 
self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) # check dgc op regularization coeff train_ops = program.global_block().ops diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py new file mode 100755 index 0000000000000..6de04c14bfa70 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py @@ -0,0 +1,176 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +from paddle.autograd import PyLayer +from paddle.distributed.fleet.utils import recompute +import random + +import paddle.fluid.layers as layers + + +def get_fc_block(block_idx, input_size, is_last=False): + block_name = "block_" + str(block_idx) + block = paddle.nn.Sequential( + (block_name + "_fc_0", paddle.nn.Linear( + input_size, input_size, bias_attr=False)), + (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), + (block_name + "_relu_1", paddle.nn.ReLU()), + (block_name + "_fc_1", paddle.nn.Linear( + input_size, input_size, bias_attr=False)), + (block_name + "_relu_2", paddle.nn.ReLU()), ) + if is_last: + block.add_sublayer( + block_name + "_fc_2", + paddle.nn.Linear( + input_size, 1, bias_attr=False)) # add sublayer + else: + block.add_sublayer( + block_name + "_fc_2", + paddle.nn.Linear( + input_size, input_size, bias_attr=False)) # add sublayer + return block + + +class Naive_fc_net(paddle.nn.Layer): + def __init__(self, + input_size=10, + recompute_blocks=[1, 3], + recompute_kwargs={}): + super(Naive_fc_net, self).__init__() + self.recompute_blocks = recompute_blocks + self.recompute_kwargs = recompute_kwargs + self.runfunc0 = get_fc_block(0, input_size, is_last=False) + self.runfunc1 = get_fc_block(1, input_size, is_last=False) + self.runfunc2 = get_fc_block(2, input_size, is_last=False) + self.runfunc3 = get_fc_block(3, input_size, is_last=False) + self.runfunc4 = get_fc_block(4, input_size, is_last=True) + + def forward(self, inputs): + + if 0 in self.recompute_blocks: + inputs = recompute(self.runfunc0, inputs) + else: + inputs = self.runfunc0(inputs) + + if 1 in self.recompute_blocks: + inputs = recompute(self.runfunc1, inputs) + else: + inputs = self.runfunc1(inputs) + + if 2 in self.recompute_blocks: + inputs = recompute(self.runfunc2, inputs, **self.recompute_kwargs) + else: + inputs = self.runfunc2(inputs) + + if 3 in self.recompute_blocks: + inputs = recompute(self.runfunc3, inputs) + else: + inputs = self.runfunc3(inputs) + + if 4 in self.recompute_blocks: + inputs = recompute(self.runfunc4, inputs) + else: + inputs = self.runfunc4(inputs) + + return inputs + + +def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): + gen = paddle.seed(10) + gen.manual_seed(10) + np.random.seed(10) + random.seed(10) + + if cuda_state: + paddle.set_cuda_rng_state(cuda_state) + + batch_size, input_size = 1, 10 + model = Naive_fc_net( + 
input_size, + recompute_blocks=recompute_block, + recompute_kwargs=recompute_kwargs) + loss_fn = paddle.nn.MSELoss(reduction='mean') + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + + loss_ = [] + param_ = [] + grad_ = [] + for step in range(10): + x_data = np.random.randn(batch_size, input_size).astype(np.float32) + x = paddle.to_tensor(x_data) + # x.stop_gradient = False + y_pred = model(x) + loss = y_pred.mean() + + loss_.append(np.asarray(loss).tolist()) + loss.backward() + optimizer.step() + + param_.append(np.asarray(model.parameters()[9]).tolist()) + grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist()) + + optimizer.clear_grad() + return loss_, param_, grad_ + + +class TestPyLayer(unittest.TestCase): + def test_fc_net_with_dropout(self): + def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + + cuda_state = paddle.get_cuda_rng_state() + # without recompute + loss_ref, param_ref, grad_ref = run_model( + cuda_state, recompute_block=[]) + + # recompute second block + loss, param, grad = run_model(cuda_state, recompute_block=[1, 3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute fourth block + loss, param, grad = run_model(cuda_state, recompute_block=[3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second to fourth block + loss, param, grad = run_model(cuda_state, recompute_block=[1, 2, 3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second & fourth block + loss, param, grad = run_model(cuda_state, recompute_block=[1, 3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + def test_recompute_kwargs(self): + paddle.set_device("gpu") + kwargs = {"is_test": False} + with self.assertRaises(ValueError): + loss_ref, param_ref, grad_ref = run_model( + None, recompute_block=[2], recompute_kwargs=kwargs) + + def test_recompute_cpu_rng(self): + paddle.set_device("cpu") + with self.assertRaises(RuntimeError): + loss_ref, param_ref, grad_ref = run_model(None, recompute_block=[2]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 4d1e936558abf..be5e87b9d344b 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -524,13 +524,14 @@ def test_sharding_with_pp(self): # check program startup_prog_op_types = [op.type for op in startup_prog_ops] main_prog_op_types = [op.type for op in main_prog_ops] + print(startup_prog_op_types) self.assertEqual(startup_prog_op_types, [ + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', - 'fill_constant', 'fill_constant', 'fill_constant', 'uniform_random', - 'fill_constant', 'uniform_random', 'fill_constant', 'c_gen_nccl_id', - 'c_comm_init', 'fill_constant', 'c_allreduce_sum', 'c_gen_nccl_id', - 'c_comm_init', 'fill_constant', 'c_allreduce_sum', 'c_gen_nccl_id', - 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init' + 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 'c_allreduce_sum', + 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 
'c_allreduce_sum', + 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_fleet_utils.py b/python/paddle/fluid/tests/unittests/test_fleet_utils.py index 51c12375948f5..09de4867ef9f4 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_utils.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_utils.py @@ -24,7 +24,6 @@ from paddle.dataset.common import download, DATA_HOME import paddle.fluid.incubate.fleet.base.role_maker as role_maker from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet -from paddle.fluid.incubate.fleet.utils.fleet_barrier_util import check_all_trainers_ready from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil import paddle.fluid.incubate.fleet.utils.utils as utils @@ -50,15 +49,6 @@ def test_fleet_util_init(self): fleet_util_transpiler = FleetUtil(mode="transpiler") self.assertRaises(Exception, FleetUtil, "other") - def test_fleet_barrier(self): - role = role_maker.UserDefinedRoleMaker( - current_id=0, - role=role_maker.Role.WORKER, - worker_num=1, - server_endpoints=['127.0.0.1']) - fleet.init(role) - check_all_trainers_ready("/ready_path/", 0) - def test_program_type_trans(self): data_dir = self.download_files() program_dir = os.path.join(data_dir, self.pruned_dir) diff --git a/python/paddle/fluid/tests/unittests/test_glu.py b/python/paddle/fluid/tests/unittests/test_glu.py index 63818d8ac50f2..25f1975db0c52 100644 --- a/python/paddle/fluid/tests/unittests/test_glu.py +++ b/python/paddle/fluid/tests/unittests/test_glu.py @@ -17,6 +17,9 @@ import paddle.fluid.dygraph as dg import unittest +import paddle +from paddle.nn import functional as F + def sigmoid(x): return 1.0 / (1.0 + np.exp(-x)) @@ -48,5 +51,25 @@ def test_case(self): self.check_identity(fluid.CUDAPlace(0)) +class TestGLUV2(unittest.TestCase): + def setUp(self): + self.x = np.random.randn(5, 20) + self.dim = -1 + self.out = glu(self.x, self.dim) + + def check_identity(self, place): + with dg.guard(place): + x_var = paddle.to_tensor(self.x) + y_var = F.glu(x_var, self.dim) + y_np = y_var.numpy() + + np.testing.assert_allclose(y_np, self.out) + + def test_case(self): + self.check_identity(fluid.CPUPlace()) + if fluid.is_compiled_with_cuda(): + self.check_identity(fluid.CUDAPlace(0)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py index 2b51bec9cb0e7..e528e742a277a 100644 --- a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py +++ b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py @@ -34,6 +34,7 @@ def test_import_paddle(self): with open(test_file, 'w') as wb: cmd_test = """ import paddle +paddle.utils.run_check() x = paddle.rand([3,4]) assert x.place.is_gpu_place() is False, "There is no CUDA device, but Tensor's place is CUDAPlace" """ @@ -52,7 +53,7 @@ def test_import_paddle(self): assert 'CPU device will be used by default' in str( stderr ), "GPU version Paddle is installed. 
But CPU device can't be used when CUDA device is not set properly" - assert "Error" not in str( + assert "AssertionError" not in str( stderr ), "There is no CUDA device, but Tensor's place is CUDAPlace" diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index ef2900be39c9a..a56797971b514 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -106,6 +106,20 @@ def func(): self.assertRaises(ValueError, func) + def test_amp_guard_upsupported_fp16_op(self): + data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32') + with fluid.dygraph.guard(): + conv2d = fluid.dygraph.Conv2D(3, 2, 3, bias_attr=False, act=None) + data = fluid.dygraph.to_variable(data) + with fluid.dygraph.amp_guard(True): + out_fp16 = conv2d(data) + out_fp32 = paddle.expand_as( + out_fp16, out_fp16) # expand_as_v2 has no fp16 kernel + + self.assertTrue(data.dtype == fluid.core.VarDesc.VarType.FP32) + self.assertTrue(out_fp16.dtype == fluid.core.VarDesc.VarType.FP16) + self.assertTrue(out_fp32.dtype == fluid.core.VarDesc.VarType.FP32) + class TestAmpScaler(unittest.TestCase): def test_scale(self): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index e6e7b8222a4b3..9dae36c3c223f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -296,6 +296,28 @@ def test_paddle_imperative_no_grad_guard(self): self.assertTrue(tmp._grad_ivar() is None) self.assertTrue(l0.weight._grad_ivar() is not None) + def test_paddle_imperative_set_grad_enabled(self): + data = np.array([[2, 3], [4, 5]]).astype('float32') + with fluid.dygraph.guard(): + l0 = fluid.Linear(2, 2) + self.assertTrue(l0.weight._grad_ivar() is None) + l1 = fluid.Linear(2, 2) + with paddle.set_grad_enabled(False): + self.assertTrue(l1.weight.stop_gradient is False) + tmp = l1.weight * 2 + with paddle.set_grad_enabled(True): + tmp2 = l1.weight * 2 + self.assertTrue(tmp.stop_gradient) + self.assertTrue(tmp2.stop_gradient is False) + x = fluid.dygraph.to_variable(data) + y = l0(x) + tmp2 + o = l1(y) + o.backward() + + self.assertTrue(tmp._grad_ivar() is None) + self.assertTrue(tmp2._grad_ivar() is not None) + self.assertTrue(l0.weight._grad_ivar() is not None) + def test_sum_op(self): x = np.ones([2, 2], np.float32) with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 3d1b08186384c..237ff0c958e39 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -137,9 +137,9 @@ def test_uniform_initializer_random_seed(self): name="param2", initializer=initializer.UniformInitializer(seed=456)) init_op = block.ops[1] - self.assertEqual(init_op.attr("seed"), 123) + self.assertEqual(init_op.attr("seed"), 456) init_op1 = block.ops[0] - self.assertEqual(init_op1.attr("seed"), 456) + self.assertEqual(init_op1.attr("seed"), 123) def test_uniform_initializer(self, dtype="float32"): """Test uniform initializer with supplied attributes @@ -594,12 +594,12 @@ def test_set_global_weight_initilizer(self): block = startup_prog.global_block() self.assertEqual(len(block.ops), 2) - # init bias is the first op, and 
weight is the second - bias_init_op = block.ops[0] + # init weight is the first op, and bias is the second + bias_init_op = block.ops[1] self.assertEqual(bias_init_op.type, 'fill_constant') self.assertAlmostEqual(bias_init_op.attr('value'), 0.0, delta=DELTA) - param_init_op = block.ops[1] + param_init_op = block.ops[0] self.assertEqual(param_init_op.type, 'uniform_random') self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA) self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA) @@ -624,14 +624,14 @@ def test_set_global_bias_initilizer(self): block = startup_prog.global_block() self.assertEqual(len(block.ops), 2) - # init bias is the first op, and weight is the second - bias_init_op = block.ops[0] + # init weight is the first op, and bias is the second + bias_init_op = block.ops[1] self.assertEqual(bias_init_op.type, 'gaussian_random') self.assertAlmostEqual(bias_init_op.attr('mean'), 0.0, delta=DELTA) self.assertAlmostEqual(bias_init_op.attr('std'), 2.0, delta=DELTA) self.assertEqual(bias_init_op.attr('seed'), 0) - param_init_op = block.ops[1] + param_init_op = block.ops[0] self.assertEqual(param_init_op.type, 'uniform_random') self.assertAlmostEqual(param_init_op.attr('min'), -0.5, delta=DELTA) self.assertAlmostEqual(param_init_op.attr('max'), 0.5, delta=DELTA) @@ -665,5 +665,49 @@ def test_uniform_initializer(self, dtype="float32"): paddle.enable_static() +class TesetconsistencyOfDynamicAndStaticGraph(unittest.TestCase): + def test_order(self): + paddle.set_device('cpu') + SEED = 123 + weight_attr = paddle.framework.ParamAttr( + name="linear_weight", + learning_rate=1.0, + trainable=False, + regularizer=None, + initializer=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0)) + bias_attr = paddle.framework.ParamAttr( + name="linear_bias", + learning_rate=1.0, + trainable=False, + regularizer=None, + initializer=paddle.nn.initializer.TruncatedNormal( + mean=0.0, std=2.0)) + + def run_dynamic_graph(): + paddle.disable_static() + paddle.seed(SEED) + linear = paddle.nn.Linear( + 1, 1, weight_attr=weight_attr, bias_attr=bias_attr) + return linear.weight.numpy(), linear.bias.numpy() + paddle.enable_static() + + def run_static_graph(): + paddle.enable_static() + exe = paddle.static.Executor(paddle.CPUPlace()) + paddle.seed(SEED) + linear = paddle.nn.Linear( + 1, 1, weight_attr=weight_attr, bias_attr=bias_attr) + res = exe.run(paddle.static.default_startup_program(), + fetch_list=['linear_weight', 'linear_bias']) + return res[0], res[1] + + dynamic_res = run_dynamic_graph() + static_res = run_static_graph() + + self.assertTrue(np.array_equal(dynamic_res[0], static_res[0])) + self.assertTrue(np.array_equal(dynamic_res[1], static_res[1])) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index ffecec1815b15..31704ebcd9192 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -164,10 +164,10 @@ def test_vanilla_momentum_optimizer(self): # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 2) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) + self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) + self.assertEqual(init_ops[0].type, 
"fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) def test_nesterov_momentum_optimizer(self): init_program = framework.Program() @@ -217,10 +217,10 @@ def test_nesterov_momentum_optimizer(self): # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 2) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) + self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) class TestAdagradOptimizer(unittest.TestCase): @@ -277,10 +277,10 @@ def test_adagrad_optimizer(self): # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 2) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) + self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) class TestAdamOptimizer(unittest.TestCase): @@ -344,8 +344,8 @@ def test_adam_optimizer(self): # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 5) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + self.assertEqual(init_ops[-1].type, "fill_constant") + self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) class TestAdamaxOptimizer(unittest.TestCase): @@ -409,8 +409,8 @@ def test_adamax_optimizer(self): # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 4) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + self.assertEqual(init_ops[-1].type, "fill_constant") + self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) class TestDpsgdOptimizer(unittest.TestCase): @@ -509,10 +509,10 @@ def test_decayed_adagrad_optimizer(self): # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 2) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) self.assertEqual(init_ops[1].type, "fill_constant") - self.assertAlmostEqual(init_ops[1].attr('value'), 0.0) + self.assertAlmostEqual(init_ops[1].attr('value'), learning_rate) + self.assertEqual(init_ops[0].type, "fill_constant") + self.assertAlmostEqual(init_ops[0].attr('value'), 0.0) class TestFtrlOptimizer(unittest.TestCase): @@ -576,8 +576,8 @@ def test_ftrl_optimizer(self): # Check init_program init_ops = init_program.global_block().ops self.assertEqual(len(init_ops), 3) - self.assertEqual(init_ops[0].type, "fill_constant") - self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) + self.assertEqual(init_ops[-1].type, "fill_constant") + self.assertAlmostEqual(init_ops[-1].attr('value'), learning_rate) class TestLookaheadOptimizer(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_optimizer_for_varbase.py b/python/paddle/fluid/tests/unittests/test_optimizer_for_varbase.py new file mode 100644 index 0000000000000..8fdedce22469a --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_optimizer_for_varbase.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +import paddle +import paddle.optimizer as optimizer + + +class TestOptimizerForVarBase(unittest.TestCase): + def setUp(self): + self.lr = 0.01 + + def run_optimizer_step_with_varbase_list_input(self, optimizer): + x = paddle.zeros([2, 3]) + y = paddle.ones([2, 3]) + x.stop_gradient = False + + z = x + y + + opt = optimizer( + learning_rate=self.lr, parameters=[x], weight_decay=0.01) + + z.backward() + opt.step() + + self.assertTrue(np.allclose(x.numpy(), np.full([2, 3], -self.lr))) + + def run_optimizer_minimize_with_varbase_list_input(self, optimizer): + x = paddle.zeros([2, 3]) + y = paddle.ones([2, 3]) + x.stop_gradient = False + + z = x + y + + opt = optimizer(learning_rate=self.lr, parameters=[x]) + + z.backward() + opt.minimize(z) + + self.assertTrue(np.allclose(x.numpy(), np.full([2, 3], -self.lr))) + + def test_adam_with_varbase_list_input(self): + self.run_optimizer_step_with_varbase_list_input(optimizer.Adam) + self.run_optimizer_minimize_with_varbase_list_input(optimizer.Adam) + + def test_sgd_with_varbase_list_input(self): + self.run_optimizer_step_with_varbase_list_input(optimizer.SGD) + self.run_optimizer_minimize_with_varbase_list_input(optimizer.SGD) + + def test_adagrad_with_varbase_list_input(self): + self.run_optimizer_step_with_varbase_list_input(optimizer.Adagrad) + self.run_optimizer_minimize_with_varbase_list_input(optimizer.Adagrad) + + def test_adamw_with_varbase_list_input(self): + self.run_optimizer_step_with_varbase_list_input(optimizer.AdamW) + self.run_optimizer_minimize_with_varbase_list_input(optimizer.AdamW) + + def test_adamax_with_varbase_list_input(self): + self.run_optimizer_step_with_varbase_list_input(optimizer.Adamax) + self.run_optimizer_minimize_with_varbase_list_input(optimizer.Adamax) + + def test_momentum_with_varbase_list_input(self): + self.run_optimizer_step_with_varbase_list_input(optimizer.Momentum) + self.run_optimizer_minimize_with_varbase_list_input(optimizer.Momentum) + + def test_optimizer_with_varbase_input(self): + x = paddle.zeros([2, 3]) + with self.assertRaises(TypeError): + optimizer.Adam(learning_rate=self.lr, parameters=x) + + def test_create_param_lr_with_1_for_coverage(self): + x = paddle.fluid.framework.ParamBase( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="x", + optimize_attr={'learning_rate': 1.0}) + x.value().get_tensor().set( + np.random.random((5, 10)).astype('float32'), + paddle.fluid.framework._current_expected_place()) + + y = paddle.ones([5, 10]) + z = x + y + opt = optimizer.Adam(learning_rate=self.lr, parameters=[x]) + z.backward() + opt.step() + + def test_create_param_lr_with_no_1_value_for_coverage(self): + x = paddle.fluid.framework.ParamBase( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="x", + 
optimize_attr={'learning_rate': 0.12}) + x.value().get_tensor().set( + np.random.random((5, 10)).astype('float32'), + paddle.fluid.framework._current_expected_place()) + + y = paddle.ones([5, 10]) + z = x + y + opt = optimizer.Adam(learning_rate=self.lr, parameters=[x]) + z.backward() + opt.step() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index f1001bfe9ccb7..3a5c43b2bab3e 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -27,6 +27,7 @@ from paddle.fluid.optimizer import Adam import paddle.fluid.framework as framework from test_imperative_base import new_program_scope +from paddle.optimizer.lr import LRScheduler BATCH_SIZE = 16 BATCH_NUM = 4 @@ -262,8 +263,31 @@ def test_replace_static_save_load(self): def test_paddle_save_load_v2(self): paddle.disable_static() + + class StepDecay(LRScheduler): + def __init__(self, + learning_rate, + step_size, + gamma=0.1, + last_epoch=-1, + verbose=False): + self.step_size = step_size + self.gamma = gamma + super(StepDecay, self).__init__(learning_rate, last_epoch, + verbose) + + def get_lr(self): + i = self.last_epoch // self.step_size + return self.base_lr * (self.gamma**i) + layer = LinearNet() - state_dict = layer.state_dict() + inps = paddle.randn([2, IMAGE_SIZE]) + adam = opt.Adam( + learning_rate=StepDecay(0.1, 1), parameters=layer.parameters()) + y = layer(inps) + y.mean().backward() + adam.step() + state_dict = adam.state_dict() path = 'paddle_save_load_v2/model.pdparams' with self.assertRaises(TypeError): paddle.save(state_dict, path, use_binary_format='False') @@ -274,9 +298,15 @@ def test_paddle_save_load_v2(self): paddle.save(state_dict, path) load_dict_np = paddle.framework.io._legacy_load(path) for k, v in state_dict.items(): - self.assertTrue( - np.array_equal(v.numpy(), load_dict_tensor[k].numpy())) - self.assertTrue(np.array_equal(v.numpy(), load_dict_np[k])) + if isinstance(v, dict): + self.assertTrue(v == load_dict_tensor[k]) + else: + self.assertTrue( + np.array_equal(v.numpy(), load_dict_tensor[k].numpy())) + if not np.array_equal(v.numpy(), load_dict_np[k]): + print(v.numpy()) + print(load_dict_np[k]) + self.assertTrue(np.array_equal(v.numpy(), load_dict_np[k])) def test_single_pickle_var_dygraph(self): # enable dygraph mode @@ -370,6 +400,366 @@ def test_dygraph_save_static_load(self): np.array_equal(tensor.numpy(), np.array(state_dict_param[tensor.name]))) + def test_save_load_complex_object_dygraph_save(self): + paddle.disable_static() + layer = paddle.nn.Linear(3, 4) + state_dict = layer.state_dict() + obj1 = [ + paddle.randn( + [3, 4], dtype='float32'), np.random.randn(5, 6), + ('fake_weight', np.ones( + [7, 8], dtype='float32')) + ] + obj2 = {'k1': obj1, 'k2': state_dict, 'epoch': 123} + obj3 = (paddle.randn( + [5, 4], dtype='float32'), np.ndarray( + [3, 4], dtype="float32"), { + "state_dict": state_dict, + "opt": state_dict + }) + obj4 = (np.random.randn(5, 6), (123, )) + + path1 = "test_save_load_any_complex_object_dygraph/obj1" + path2 = "test_save_load_any_complex_object_dygraph/obj2" + path3 = "test_save_load_any_complex_object_dygraph/obj3" + path4 = "test_save_load_any_complex_object_dygraph/obj4" + paddle.save(obj1, path1) + paddle.save(obj2, path2) + paddle.save(obj3, path3) + paddle.save(obj4, path4) + + load_tensor1 = paddle.load(path1, return_numpy=False) + load_tensor2 = 
paddle.load(path2, return_numpy=False) + load_tensor3 = paddle.load(path3, return_numpy=False) + load_tensor4 = paddle.load(path4, return_numpy=False) + + self.assertTrue( + np.array_equal(load_tensor1[0].numpy(), obj1[0].numpy())) + self.assertTrue(np.array_equal(load_tensor1[1], obj1[1])) + self.assertTrue(np.array_equal(load_tensor1[2].numpy(), obj1[2][1])) + for i in range(len(load_tensor1)): + self.assertTrue( + type(load_tensor1[i]) == type(load_tensor2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(v.numpy(), load_tensor2['k2'][k].numpy())) + self.assertTrue(load_tensor2['epoch'] == 123) + + self.assertTrue( + np.array_equal(load_tensor3[0].numpy(), obj3[0].numpy())) + self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_tensor3[2]["state_dict"][k].numpy(), + v.numpy())) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_tensor3[2]["opt"][k].numpy(), v.numpy())) + + self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0])) + + load_array1 = paddle.load(path1, return_numpy=True) + load_array2 = paddle.load(path2, return_numpy=True) + load_array3 = paddle.load(path3, return_numpy=True) + load_array4 = paddle.load(path4, return_numpy=True) + + self.assertTrue(np.array_equal(load_array1[0], obj1[0].numpy())) + self.assertTrue(np.array_equal(load_array1[1], obj1[1])) + self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + for i in range(len(load_array1)): + self.assertTrue(type(load_array1[i]) == type(load_array2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(v.numpy(), load_array2['k2'][k])) + self.assertTrue(load_array2['epoch'] == 123) + + self.assertTrue(np.array_equal(load_array3[0], obj3[0].numpy())) + self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_array3[2]["state_dict"][k], v.numpy())) + + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(load_array3[2]["opt"][k], v.numpy())) + + self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + + # static mode + paddle.enable_static() + + load_tensor1 = paddle.load(path1, return_numpy=False) + load_tensor2 = paddle.load(path2, return_numpy=False) + load_tensor3 = paddle.load(path3, return_numpy=False) + load_tensor4 = paddle.load(path4, return_numpy=False) + + self.assertTrue( + np.array_equal(np.array(load_tensor1[0]), obj1[0].numpy())) + self.assertTrue(np.array_equal(np.array(load_tensor1[1]), obj1[1])) + self.assertTrue(np.array_equal(np.array(load_tensor1[2]), obj1[2][1])) + + for i in range(len(load_tensor1)): + self.assertTrue( + type(load_tensor1[i]) == type(load_tensor2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(v.numpy(), np.array(load_tensor2['k2'][k]))) + self.assertTrue(load_tensor2['epoch'] == 123) + + self.assertTrue( + isinstance(load_tensor3[0], paddle.fluid.core.LoDTensor)) + self.assertTrue( + np.array_equal(np.array(load_tensor3[0]), obj3[0].numpy())) + self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + isinstance(load_tensor3[2]["state_dict"][k], + paddle.fluid.core.LoDTensor)) + self.assertTrue( + np.array_equal( + np.array(load_tensor3[2]["state_dict"][k]), v.numpy())) + + for k, v in state_dict.items(): + self.assertTrue( + isinstance(load_tensor3[2]["opt"][k], + paddle.fluid.core.LoDTensor)) + 
self.assertTrue( + np.array_equal(np.array(load_tensor3[2]["opt"][k]), v.numpy())) + + self.assertTrue(load_tensor4[0], paddle.fluid.core.LoDTensor) + self.assertTrue(np.array_equal(np.array(load_tensor4[0]), obj4[0])) + + load_array1 = paddle.load(path1, return_numpy=True) + load_array2 = paddle.load(path2, return_numpy=True) + load_array3 = paddle.load(path3, return_numpy=True) + load_array4 = paddle.load(path4, return_numpy=True) + + self.assertTrue(np.array_equal(load_array1[0], obj1[0].numpy())) + self.assertTrue(np.array_equal(load_array1[1], obj1[1])) + self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + for i in range(len(load_array1)): + self.assertTrue(type(load_array1[i]) == type(load_array2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(v.numpy(), load_array2['k2'][k])) + self.assertTrue(load_array2['epoch'] == 123) + + self.assertTrue(isinstance(load_array3[0], np.ndarray)) + self.assertTrue(np.array_equal(load_array3[0], obj3[0].numpy())) + self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_array3[2]["state_dict"][k], v.numpy())) + + for k, v in state_dict.items(): + self.assertTrue(np.array_equal(load_array3[2]["opt"][k], v.numpy())) + + self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + + def test_save_load_complex_object_static_save(self): + paddle.enable_static() + with new_program_scope(): + # create network + x = paddle.static.data( + name="x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 10, bias_attr=False) + z = paddle.static.nn.fc(z, 128, bias_attr=False) + loss = fluid.layers.reduce_mean(z) + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + prog = paddle.static.default_main_program() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + + state_dict = prog.state_dict() + keys = list(state_dict.keys()) + obj1 = [ + state_dict[keys[0]], np.random.randn(5, 6), + ('fake_weight', np.ones( + [7, 8], dtype='float32')) + ] + obj2 = {'k1': obj1, 'k2': state_dict, 'epoch': 123} + obj3 = (state_dict[keys[0]], np.ndarray( + [3, 4], dtype="float32"), { + "state_dict": state_dict, + "opt": state_dict + }) + obj4 = (np.ndarray([3, 4], dtype="float32"), ) + + path1 = "test_save_load_any_complex_object_static/obj1" + path2 = "test_save_load_any_complex_object_static/obj2" + path3 = "test_save_load_any_complex_object_static/obj3" + path4 = "test_save_load_any_complex_object_static/obj4" + paddle.save(obj1, path1) + paddle.save(obj2, path2) + paddle.save(obj3, path3) + paddle.save(obj4, path4) + + load_tensor1 = paddle.load(path1, return_numpy=False) + load_tensor2 = paddle.load(path2, return_numpy=False) + load_tensor3 = paddle.load(path3, return_numpy=False) + load_tensor4 = paddle.load(path4, return_numpy=False) + + self.assertTrue( + np.array_equal(np.array(load_tensor1[0]), np.array(obj1[0]))) + self.assertTrue(np.array_equal(np.array(load_tensor1[1]), obj1[1])) + self.assertTrue( + np.array_equal(np.array(load_tensor1[2]), obj1[2][1])) + for i in range(len(load_tensor1)): + self.assertTrue( + type(load_tensor1[i]) == type(load_tensor2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal( + np.array(v), np.array(load_tensor2['k2'][k]))) + self.assertTrue(load_tensor2['epoch'] == 123) + + self.assertTrue(isinstance(load_tensor3[0], fluid.core.LoDTensor)) + 
self.assertTrue(np.array_equal(np.array(load_tensor3[0]), obj3[0])) + self.assertTrue(isinstance(load_tensor3[1], fluid.core.LoDTensor)) + self.assertTrue(np.array_equal(np.array(load_tensor3[1]), obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + isinstance(load_tensor3[2]["state_dict"][k], + fluid.core.LoDTensor)) + self.assertTrue( + np.array_equal( + np.array(load_tensor3[2]["state_dict"][k]), np.array( + v))) + + for k, v in state_dict.items(): + self.assertTrue( + isinstance(load_tensor3[2]["opt"][k], fluid.core.LoDTensor)) + self.assertTrue( + np.array_equal( + np.array(load_tensor3[2]["opt"][k]), np.array(v))) + + self.assertTrue(isinstance(load_tensor4[0], fluid.core.LoDTensor)) + self.assertTrue(np.array_equal(np.array(load_tensor4[0]), obj4[0])) + + load_array1 = paddle.load(path1, return_numpy=True) + load_array2 = paddle.load(path2, return_numpy=True) + load_array3 = paddle.load(path3, return_numpy=True) + load_array4 = paddle.load(path4, return_numpy=True) + + self.assertTrue(np.array_equal(load_array1[0], np.array(obj1[0]))) + self.assertTrue(np.array_equal(load_array1[1], obj1[1])) + self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + for i in range(len(load_array1)): + self.assertTrue( + type(load_array1[i]) == type(load_array2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(np.array(v), load_array2['k2'][k])) + self.assertTrue(load_array2['epoch'] == 123) + + self.assertTrue(np.array_equal(load_array3[0], np.array(obj3[0]))) + self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_array3[2]["state_dict"][k], np.array( + v))) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_array3[2]["opt"][k], np.array(v))) + + self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + + # dygraph mode + paddle.disable_static() + + load_tensor1 = paddle.load(path1, return_numpy=False) + load_tensor2 = paddle.load(path2, return_numpy=False) + load_tensor3 = paddle.load(path3, return_numpy=False) + load_tensor4 = paddle.load(path4, return_numpy=False) + + self.assertTrue( + np.array_equal(np.array(load_tensor1[0]), np.array(obj1[0]))) + self.assertTrue(np.array_equal(np.array(load_tensor1[1]), obj1[1])) + self.assertTrue(np.array_equal(load_tensor1[2].numpy(), obj1[2][1])) + for i in range(len(load_tensor1)): + self.assertTrue( + type(load_tensor1[i]) == type(load_tensor2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal( + np.array(v), np.array(load_tensor2['k2'][k]))) + self.assertTrue(load_tensor2['epoch'] == 123) + + self.assertTrue(isinstance(load_tensor3[0], fluid.core.VarBase)) + self.assertTrue(np.array_equal(load_tensor3[0].numpy(), obj3[0])) + self.assertTrue(isinstance(load_tensor3[1], fluid.core.VarBase)) + self.assertTrue(np.array_equal(load_tensor3[1].numpy(), obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + isinstance(load_tensor3[2]["state_dict"][k], + fluid.core.VarBase)) + self.assertTrue( + np.array_equal(load_tensor3[2]["state_dict"][k].numpy(), + np.array(v))) + + for k, v in state_dict.items(): + self.assertTrue( + isinstance(load_tensor3[2]["opt"][k], fluid.core.VarBase)) + self.assertTrue( + np.array_equal(load_tensor3[2]["opt"][k].numpy(), + np.array(v))) + + self.assertTrue(isinstance(load_tensor4[0], fluid.core.VarBase)) + self.assertTrue(np.array_equal(load_tensor4[0].numpy(), obj4[0])) + + load_array1 = paddle.load(path1, return_numpy=True) + 
load_array2 = paddle.load(path2, return_numpy=True) + load_array3 = paddle.load(path3, return_numpy=True) + load_array4 = paddle.load(path4, return_numpy=True) + + self.assertTrue(np.array_equal(load_array1[0], np.array(obj1[0]))) + self.assertTrue(np.array_equal(load_array1[1], obj1[1])) + self.assertTrue(np.array_equal(load_array1[2], obj1[2][1])) + for i in range(len(load_array1)): + self.assertTrue( + type(load_array1[i]) == type(load_array2['k1'][i])) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(np.array(v), load_array2['k2'][k])) + self.assertTrue(load_array2['epoch'] == 123) + + self.assertTrue(np.array_equal(load_array3[0], np.array(obj3[0]))) + self.assertTrue(np.array_equal(load_array3[1], obj3[1])) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_array3[2]["state_dict"][k], np.array( + v))) + + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(load_array3[2]["opt"][k], np.array(v))) + + self.assertTrue(isinstance(load_array4[0], np.ndarray)) + self.assertTrue(np.array_equal(load_array4[0], obj4[0])) + + def test_varbase_binary_var(self): + paddle.disable_static() + varbase = paddle.randn([3, 2], dtype='float32') + path = 'test_paddle_save_load_varbase_binary_var/varbase' + paddle.save(varbase, path, use_binary_format=True) + load_array = paddle.load(path, return_numpy=True) + load_tensor = paddle.load(path, return_numpy=False) + origin_array = varbase.numpy() + load_tensor_array = load_tensor.numpy() + if paddle.fluid.core.is_compiled_with_cuda(): + fluid.core._cuda_synchronize(paddle.CUDAPlace(0)) + self.assertTrue(np.array_equal(origin_array, load_array)) + self.assertTrue(np.array_equal(origin_array, load_tensor_array)) + class TestSaveLoad(unittest.TestCase): def setUp(self): @@ -431,8 +821,6 @@ def test_save_load(self): # error test cases, some tests relay base test above # 1. test save obj not dict error test_list = [1, 2, 3] - with self.assertRaises(NotImplementedError): - paddle.save(test_list, "not_dict_error_path") # 2. test save path format error with self.assertRaises(ValueError): @@ -471,5 +859,33 @@ def test_save_load_program(self): self.assertTrue(origin_startup == load_startup) +class TestSaveLoadLayer(unittest.TestCase): + def test_save_load_layer(self): + if six.PY2: + return + + paddle.disable_static() + inps = paddle.randn([1, IMAGE_SIZE], dtype='float32') + layer1 = LinearNet() + layer2 = LinearNet() + layer1.eval() + layer2.eval() + origin = (layer1(inps), layer2(inps)) + path = "test_save_load_layer_/layer.pdmodel" + paddle.save((layer1, layer2), path) + + # static + paddle.enable_static() + with self.assertRaises(ValueError): + paddle.load(path) + # dygraph + paddle.disable_static() + + loaded_layer = paddle.load(path) + loaded_result = [l(inps) for l in loaded_layer] + for i in range(len(origin)): + self.assertTrue((origin[i] - loaded_result[i]).abs().max() < 1e-10) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py new file mode 100644 index 0000000000000..8b508d5c9ae79 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load_binary.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import os +import sys +import six + +import paddle +import paddle.nn as nn +import paddle.optimizer as opt +import paddle.fluid as fluid +from paddle.fluid.optimizer import Adam +import paddle.fluid.framework as framework +from test_imperative_base import new_program_scope + +IMAGE_SIZE = 784 + + +class TestSaveLoadBinaryFormat(unittest.TestCase): + def setUp(self): + # enable static graph mode + paddle.enable_static() + + def set_zero(self, prog, place, scope=None): + if scope is None: + scope = fluid.global_scope() + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + ten = scope.find_var(var.name).get_tensor() + if ten is not None: + ten.set(np.zeros_like(np.array(ten)), place) + new_t = np.array(scope.find_var(var.name).get_tensor()) + self.assertTrue(np.sum(np.abs(new_t)) == 0) + + def replace_save_vars(self, program, dirname): + def predicate(var): + return var.persistable + + vars = filter(predicate, program.list_vars()) + for var in vars: + paddle.save( + var.get_value(), + os.path.join(dirname, var.name), + use_binary_format=True) + + def replace_load_vars(self, program, dirname): + def predicate(var): + return var.persistable + + var_list = list(filter(predicate, program.list_vars())) + for var in var_list: + var_load = paddle.load(os.path.join(dirname, var.name)) + # set var_load to scope + var.set_value(var_load) + + def test_replace_save_load_vars(self): + paddle.enable_static() + with new_program_scope(): + # create network + x = paddle.static.data( + name="x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 10, bias_attr=False) + z = paddle.static.nn.fc(z, 128, bias_attr=False) + loss = fluid.layers.reduce_mean(z) + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + base_map = {} + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + # make sure all the paramerter or optimizer var have been update + self.assertTrue(np.sum(np.abs(t)) != 0) + base_map[var.name] = t + # test for replace_save_vars/io.load_vars + path_vars1 = 'test_replace_save_load_vars_binary1/model' + self.replace_save_vars(prog, path_vars1) + # set var to zero + self.set_zero(prog, place) + var_list = list( + filter(lambda var: var.persistable, prog.list_vars())) + fluid.io.load_vars( + exe, path_vars1, main_program=prog, vars=var_list) + + for var in prog.list_vars(): + if var.persistable: + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + + self.assertTrue(np.array_equal(new_t, base_t)) + # test for io.save_vars/replace_load_vars + path_vars2 = 'test_replace_save_load_vars_binary2/model/' + fluid.io.save_vars( + exe, path_vars2, main_program=prog, vars=var_list) + self.set_zero(prog, place) + 
self.replace_load_vars(prog, path_vars2) + for var in prog.list_vars(): + if var.persistable: + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + + self.assertTrue(np.array_equal(new_t, base_t)) + + def test_save_load_lod_tensor(self): + paddle.enable_static() + OUTPUT_NUM = 32 + with new_program_scope(): + x = fluid.data(name="x", shape=[None, IMAGE_SIZE], dtype='float32') + y = fluid.layers.fc( + x, + OUTPUT_NUM, + name='fc_vars', ) + prog = fluid.default_main_program() + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + prog = paddle.static.default_main_program() + exe.run(fluid.default_startup_program()) + + dirname = 'test_save_load_lod_tensor1/tensor_' + for var in prog.list_vars(): + if var.persistable and list( + var.shape) == [IMAGE_SIZE, OUTPUT_NUM]: + tensor = var.get_value() + paddle.save( + tensor, dirname + 'fc_vars.w_0', use_binary_format=True) + break + + origin = np.array(var.get_value()) + var.set_value(np.zeros_like(origin)) + is_zeros = np.array(var.get_value()) + + loaded_tensor = paddle.load(dirname + 'fc_vars.w_0') + self.assertTrue(isinstance(loaded_tensor, fluid.core.LoDTensor)) + self.assertTrue( + list(loaded_tensor.shape()) == [IMAGE_SIZE, OUTPUT_NUM]) + to_array = np.array(loaded_tensor) + self.assertTrue(np.array_equal(origin, to_array)) + + with self.assertRaises(NotImplementedError): + path = 'test_save_load_error/temp' + paddle.save({}, path, use_binary_format=True) + + with self.assertRaises(ValueError): + path = 'test_save_load_error/temp' + with open(path, "w") as f: + f.write('\0') + paddle.load(path) + + with self.assertRaises(ValueError): + temp_lod = fluid.core.LoDTensor() + paddle.save(temp_lod, path, use_binary_format=True) + + with self.assertRaises(RuntimeError): + fluid.core._save_lod_tensor( + temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') + + with self.assertRaises(RuntimeError): + fluid.core._load_lod_tensor( + temp_lod, 'test_save_load_error_not_exist_file/not_exist_file') + + def test_save_load_selected_rows(self): + paddle.enable_static() + place = fluid.CPUPlace() if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + height = 10 + rows = [0, 4, 7] + row_numel = 12 + selected_rows = fluid.core.SelectedRows(rows, height) + path = 'test_paddle_save_load_selected_rows/sr.pdsr' + + with self.assertRaises(ValueError): + paddle.save(selected_rows, path, use_binary_format=True) + + np_array = np.random.randn(len(rows), row_numel).astype("float32") + tensor = selected_rows.get_tensor() + tensor.set(np_array, place) + + paddle.save(selected_rows, path, use_binary_format=True) + load_sr = paddle.load(path) + + self.assertTrue(isinstance(load_sr, fluid.core.SelectedRows)) + self.assertTrue(list(load_sr.rows()) == rows) + self.assertTrue(load_sr.height() == height) + self.assertTrue( + np.array_equal(np.array(load_sr.get_tensor()), np_array)) + + with self.assertRaises(RuntimeError): + fluid.core._save_selected_rows( + selected_rows, + 'test_paddle_save_load_selected_rows_not_exist_file/temp') + with self.assertRaises(RuntimeError): + fluid.core._load_selected_rows( + selected_rows, + 'test_paddle_save_load_selected_rows_not_exist_file/temp') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py index c3cb26c078e2d..4b9d6764bbb3b 100644 --- 
a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_hybrid_parallel.py @@ -21,15 +21,18 @@ class TestHybridParallel(TestMultipleGpus): - def test_hybrid_parallel_mp_layers(self): - self.run_mnist_2gpu('hybrid_parallel_mp_layers.py') - def test_hybrid_parallel_mp_random(self): self.run_mnist_2gpu('hybrid_parallel_mp_random.py') def test_hybrid_parallel_mp_model(self): self.run_mnist_2gpu('hybrid_parallel_mp_model.py') + def test_hybrid_parallel_mp_amp(self): + self.run_mnist_2gpu('hybrid_parallel_mp_amp.py') + + def test_hybrid_parallel_mp_clip_grad(self): + self.run_mnist_2gpu('hybrid_parallel_mp_clip_grad.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py new file mode 100644 index 0000000000000..e0a2770852b63 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestModelParallelLayer(TestMultipleGpus): + def test_hybrid_parallel_mp_layer(self): + self.run_mnist_2gpu('hybrid_parallel_mp_layers.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py new file mode 100644 index 0000000000000..f3b89d694f70b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_layer.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
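Reviewer note: hybrid_parallel_mp_layers.py now runs from its own wrapper file (test_parallel_dygraph_mp_layers.py above) rather than from TestHybridParallel. These multi-GPU wrapper tests all share one minimal structure; a hedged sketch of the pattern (class and script names here are hypothetical):

from test_parallel_dygraph_dataparallel import TestMultipleGpus

class TestSomeHybridFeature(TestMultipleGpus):
    def test_some_feature(self):
        # run_mnist_2gpu launches the named script on two GPUs;
        # the launched script contains the actual model and assertions.
        self.run_mnist_2gpu('hybrid_parallel_some_feature.py')  # hypothetical script name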
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestHybridPipeParallel(TestMultipleGpus): + def test_hybrid_parallel_pp_layer(self): + self.run_mnist_2gpu('hybrid_parallel_pp_layer.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index ea59a7f584a2d..47d286fb6ab32 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -159,7 +159,7 @@ def check_network_convergence(self, train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), - batch_size=16) + batch_size=8) place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py new file mode 100644 index 0000000000000..7f8294ad0efe7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestPipelineParallel(TestMultipleGpus): + def test_pipeline_parallel(self): + self.run_mnist_2gpu('hybrid_parallel_pp_model.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pylayer_op.py b/python/paddle/fluid/tests/unittests/test_pylayer_op.py index 89f8330fe5ba4..f00db0b369353 100644 --- a/python/paddle/fluid/tests/unittests/test_pylayer_op.py +++ b/python/paddle/fluid/tests/unittests/test_pylayer_op.py @@ -52,6 +52,40 @@ def backward(ctx, dy1, dy2): self.assertTrue(np.max(np.abs((input1.grad - input2.grad))) < 1e-10) + def test_simple_pylayer_return_none_with_no_grad(self): + class tanh(PyLayer): + @staticmethod + def forward(ctx, x1, x2, func1, func2=paddle.square): + ctx.func = func2 + y1 = func1(x1) + y2 = func1(x2) + ctx.save_for_backward(y1, y2) + return y1, y2 + + @staticmethod + def backward(ctx, dy1, dy2): + y1, y2 = ctx.saved_tensor() + re1 = dy1 * (1 - ctx.func(y1)) + re2 = dy2 * (1 - paddle.square(y2)) + return re1, None + + input1 = paddle.randn([2, 3]).astype("float64") + input2 = input1.detach().clone() + input3 = input1.detach().clone() + input4 = input1.detach().clone() + input1.stop_gradient = False + input2.stop_gradient = False + input3.stop_gradient = True + input4.stop_gradient = True + z = tanh.apply(input1, input3, paddle.tanh, paddle.square) + z = z[0] + z[1] + z.mean().backward() + + z2 = paddle.tanh(input2) + paddle.tanh(input4) + z2.mean().backward() + + self.assertTrue(np.max(np.abs((input1.grad - input2.grad))) < 1e-10) + def test_simple_pylayer_single_output(self): class tanh(PyLayer): @staticmethod @@ -196,7 +230,7 @@ def backward(ctx, dy1): input2.stop_gradient = False z = Layer_bk_none1.apply(input2) - with self.assertRaises(NotImplementedError): + with self.assertRaises(ValueError): with paddle.fluid.dygraph.guard(): z.sum().backward() @@ -212,7 +246,7 @@ def backward(ctx, dy1): input1 = paddle.randn([2, 3]).astype("float64") input1.stop_gradient = False z = Layer_bk_none2.apply(input1, input1) - with self.assertRaises(NotImplementedError): + with self.assertRaises(ValueError): with paddle.fluid.dygraph.guard(): z.mean().backward() @@ -228,14 +262,14 @@ def backward(ctx, dy): input1 = paddle.randn([2, 3]).astype("float64") input1.stop_gradient = False z = Layer_bk_one1.apply(input1) - with self.assertRaises(NotImplementedError): + with self.assertRaises(ValueError): with paddle.fluid.dygraph.guard(): z.mean().backward() class Layer_bk_one2(PyLayer): @staticmethod - def forward(ctx, x): - return x * 2, x * 5 + def forward(ctx, x1, x2): + return x1 * 2, x2 * 5 @staticmethod def backward(ctx, *args): @@ -243,8 +277,9 @@ def backward(ctx, *args): input1 = paddle.randn([2, 3]).astype("float64") input1.stop_gradient = False - z = Layer_bk_one1.apply(input1) - with self.assertRaises(NotImplementedError): + y = Layer_bk_one2.apply(input1, input1) + z = y[0] + y[1] + with self.assertRaises(ValueError): with paddle.fluid.dygraph.guard(): z.mean().backward() @@ -279,24 +314,97 @@ def backward(ctx, dy1, dy2): z = z[0] + z[1] z.mean().backward() + def test_pylayer_bk_return_none(self): + class Layer_bk_none1(PyLayer): + @staticmethod + def forward(ctx, x1, x2): + return x1 + x2 + + @staticmethod + def backward(ctx, dy): + return 1 + + input1 = paddle.randn([2, 3]).astype("float64") + input2 = paddle.randn([2, 3]).astype("float64") + 
input1.stop_gradient = True + input2.stop_gradient = False + z = Layer_bk_none1.apply(input1, input2) + + with self.assertRaises(ValueError): + with paddle.fluid.dygraph.guard(): + z.mean().backward() + + class Layer_bk_none2(PyLayer): + @staticmethod + def forward(ctx, x1, x2): + return x1 * 2, x2 * 5 + + @staticmethod + def backward(ctx, *args): + return 1, 1 + + input1 = paddle.randn([2, 3]).astype("float64") + input2 = paddle.randn([2, 3]).astype("float64") + input1.stop_gradient = True + input2.stop_gradient = False + z = Layer_bk_none2.apply(input1, input2) + z = z[0] + z[1] + with self.assertRaises(ValueError): + with paddle.fluid.dygraph.guard(): + z.mean().backward() + def test_pylayer_inplace(self): class cus_tanh(PyLayer): @staticmethod def forward(ctx, x): - return x.mean() + return x @staticmethod def backward(ctx, dy): return dy + class Layer(paddle.nn.Layer): + def __init__(self): + super(Layer, self).__init__() + + def forward(self, data): + data = paddle.nn.functional.relu(data) + z = paddle.tanh(data) + z = cus_tanh.apply(data) + return z.mean() + for i in range(2): data = paddle.ones([2, 3], dtype="float64") / (i + 1) data.stop_gradient = False + layer = Layer() + z = layer(data) + z.backward() + self.assertTrue(data.grad is not None) + + def test_backward_in_backward(self): + class cus_tanh(PyLayer): + @staticmethod + def forward(ctx, x): + temp = x.detach() + ctx.inputs = temp + return x.mean() + + @staticmethod + def backward(ctx, dy): + with paddle.set_grad_enabled(True): + temp = ctx.inputs + temp.stop_gradient = False + z = paddle.tanh(temp) + z.backward() + self.assertTrue(temp.grad is not None) + return paddle.to_tensor(temp.grad) + + for i in range(2): + data = paddle.ones([2, 3], dtype="float32") / (i + 1) + data.stop_gradient = False data = paddle.nn.functional.relu(data) z = paddle.tanh(data) z = cus_tanh.apply(data) - z.backward() - self.assertTrue(data.grad is not None) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py index 35bb4487c6aae..59d1ede5a0b53 100644 --- a/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py +++ b/python/paddle/fluid/tests/unittests/test_scatter_nd_op.py @@ -242,7 +242,7 @@ def check_raise_is_test(): output5 = fluid.layers.scatter_nd_add(ref5, index5, updates5) except Exception as e: t = \ - "Input(Index).shape[-1] should be no greater than Input(X).rank" + "The last dimension of Input(Index)'s shape should be no greater " if t in str(e): raise IndexError diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 0885891cdbe02..9534e4fe95416 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -775,5 +775,76 @@ def test_error(self): self._broadcast_mismatch() +# 5. 
Test backward + + +class Model(paddle.nn.Layer): + def __init__(self): + super(Model, self).__init__() + self.conv = paddle.nn.Conv2D(12, 12, 3) + + def forward(self, x, y): + x = self.conv(x) + y = self.conv(y) + var = y.flatten() + + x[0, :, 0, 0] = var + loss = paddle.mean(x) + return loss, var, x + + +class TestBackward(unittest.TestCase): + def test_static(self): + paddle.enable_static() + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + + x_np = np.random.random(size=(4, 4)).astype('float32') + y_np = np.random.random(size=(4, 4)).astype('float32') + label_np = np.random.randint(2, size=(4, 1)).astype('int64') + + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') + y = paddle.static.data(name="y", shape=[4, 4], dtype='float32') + + label = paddle.static.data( + name="label", shape=[4, 1], dtype='int64') + + z = paddle.add(x, y) + var = y[0, :] + z[0, :] = var + + prediction = paddle.static.nn.fc(x=z, size=2, activation='softmax') + + cost = paddle.nn.functional.cross_entropy( + input=prediction, label=label) + loss = paddle.mean(cost) + sgd = paddle.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(startup_program) + + var_grad, z_grad = exe.run( + main_program, + feed={"x": x_np, + "y": y_np, + "label": label_np}, + fetch_list=[var.name + "@GRAD", z.name + "@GRAD"]) + + self.assertTrue((var_grad == z_grad[0, :]).all()) + + def test_dynamic(self): + paddle.disable_static() + model = Model() + x = paddle.ones([1, 12, 3, 3]).astype("float32") + y = paddle.ones([1, 12, 3, 3]).astype("float32") + loss, var, x = model(x, y) + loss.backward() + + self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape) + self.assertTrue((var.grad == x.grad[0, :, 0, 0]).all()) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel.py new file mode 100644 index 0000000000000..6f2f7408262d9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel.py @@ -0,0 +1,63 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
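Reviewer note on the TestBackward cases added to test_set_value_op.py above: they verify that gradients flow through slice assignment (the set_value op) in both static and dygraph modes. A minimal dygraph sketch of the behaviour being asserted, with illustrative shapes and values:

import paddle

x = paddle.ones([2, 3, 4])
x.stop_gradient = False
y = x * 1.0                 # non-leaf tensor, as in the new Model.forward above
v = paddle.full([4], 2.0)
v.stop_gradient = False
y[0, 0, :] = v              # slice assignment is routed through set_value
loss = paddle.mean(y)
loss.backward()
# v receives the same gradient as the slice of y it was written into
print(v.grad)
print(y.grad[0, 0, :])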
+ +from __future__ import print_function +import unittest +from test_dist_base import TestDistBase + +import os +import paddle + +paddle.enable_static() +flag_name = os.path.splitext(__file__)[0] + + +class TestStaticModelParallel(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl_comm_num = 1 + self._pipeline_mode = True + + def test_dist_static_model_parallel(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "static_model_parallel_by_row.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + def test_dist_static_model_parallel2(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "static_model_parallel_by_col.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + def test_dist_static_model_parallel3(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "static_model_parallel_embedding.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_to_list.py b/python/paddle/fluid/tests/unittests/test_tensor_to_list.py new file mode 100644 index 0000000000000..73b91297e6fd6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_to_list.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
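Reviewer note: the new test_tensor_to_list.py (whose body follows) exercises the Tensor.tolist() method; a minimal usage sketch with illustrative values:

import numpy as np
import paddle

arr = np.arange(6).reshape([2, 3])
t = paddle.to_tensor(arr)
# tolist() mirrors numpy.ndarray.tolist(): nested Python lists with scalar leaves
assert t.tolist() == arr.tolist()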
+ +import paddle.fluid as fluid +import unittest +import numpy as np +import six +import paddle + + +class TensorToListTest(unittest.TestCase): + def setUp(self): + self.shape = [11, 25, 32, 43] + + def test_tensor_tolist(self): + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + places.append(fluid.CUDAPinnedPlace()) + + for p in places: + np_arr = np.reshape( + np.array(six.moves.range(np.prod(self.shape))), self.shape) + expectlist = np_arr.tolist() + + t = paddle.to_tensor(np_arr, place=p) + tensorlist = t.tolist() + + self.assertEqual(tensorlist, expectlist) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 76c871f37216b..7901df7917121 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -473,6 +473,70 @@ def _test_slice(self): np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1])) self.assertTrue(np.array_equal(local_out[16], tensor_array[-4:4])) + def _test_slice_for_tensor_attr(self): + tensor_array = np.array( + [[[1, 2, 3], [4, 5, 6], [7, 8, 9]], + [[10, 11, 12], [13, 14, 15], [16, 17, 18]], + [[19, 20, 21], [22, 23, 24], [25, 26, 27]]]).astype('float32') + + var = paddle.to_tensor(tensor_array) + + one = paddle.ones(shape=[1], dtype="int32") + two = paddle.full(shape=[1], fill_value=2, dtype="int32") + negative_one = paddle.full(shape=[1], fill_value=-1, dtype="int32") + four = paddle.full(shape=[1], fill_value=4, dtype="int32") + + var = fluid.dygraph.to_variable(tensor_array) + var1 = var[0, one, one] + var2 = var[one:] + var3 = var[0:one] + var4 = var[::negative_one] + var5 = var[one, one:, one:] + var_reshape = fluid.layers.reshape(var, [3, negative_one, 3]) + var6 = var_reshape[:, :, negative_one] + var7 = var[:, :, :negative_one] + var8 = var[:one, :one, :1] + var9 = var[:-1, :negative_one, :negative_one] + var10 = var[::negative_one, :one, :negative_one] + var11 = var[:negative_one, ::-1, negative_one:] + var12 = var[one:2, 2:, ::negative_one] + var13 = var[two:10, 2:, -2:negative_one] + var14 = var[1:negative_one, 0:2, ::negative_one] + var15 = var[::negative_one, ::-1, ::negative_one] + var16 = var[-4:4] + + vars = [ + var, var1, var2, var3, var4, var5, var6, var7, var8, var9, var10, + var11, var12, var13, var14, var15, var16 + ] + local_out = [var.numpy() for var in vars] + + self.assertTrue(np.array_equal(local_out[1], tensor_array[0, 1, 1:2])) + self.assertTrue(np.array_equal(local_out[2], tensor_array[1:])) + self.assertTrue(np.array_equal(local_out[3], tensor_array[0:1])) + self.assertTrue(np.array_equal(local_out[4], tensor_array[::-1])) + self.assertTrue(np.array_equal(local_out[5], tensor_array[1, 1:, 1:])) + self.assertTrue( + np.array_equal(local_out[6], + tensor_array.reshape((3, -1, 3))[:, :, -1])) + self.assertTrue(np.array_equal(local_out[7], tensor_array[:, :, :-1])) + self.assertTrue(np.array_equal(local_out[8], tensor_array[:1, :1, :1])) + self.assertTrue( + np.array_equal(local_out[9], tensor_array[:-1, :-1, :-1])) + self.assertTrue( + np.array_equal(local_out[10], tensor_array[::-1, :1, :-1])) + self.assertTrue( + np.array_equal(local_out[11], tensor_array[:-1, ::-1, -1:])) + self.assertTrue( + np.array_equal(local_out[12], tensor_array[1:2, 2:, ::-1])) + self.assertTrue( + np.array_equal(local_out[13], tensor_array[2:10, 2:, -2:-1])) + self.assertTrue( + np.array_equal(local_out[14], 
tensor_array[1:-1, 0:2, ::-1])) + self.assertTrue( + np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1])) + self.assertTrue(np.array_equal(local_out[16], tensor_array[-4:4])) + def _test_for_var(self): np_value = np.random.random((30, 100, 100)).astype('float32') w = fluid.dygraph.to_variable(np_value) @@ -483,6 +547,7 @@ def _test_for_var(self): def test_slice(self): with fluid.dygraph.guard(): self._test_slice() + self._test_slice_for_tensor_attr() self._test_for_var() var = fluid.dygraph.to_variable(self.array) @@ -631,6 +696,18 @@ def test_tensor_str_scaler(self): self.assertEqual(a_str, expected) paddle.enable_static() + def test_tensor_str_shape_with_zero(self): + paddle.disable_static(paddle.CPUPlace()) + x = paddle.ones((10, 10)) + y = paddle.fluid.layers.where(x == 0) + a_str = str(y) + + expected = '''Tensor(shape=[0, 2], dtype=int64, place=CPUPlace, stop_gradient=True, + [])''' + + self.assertEqual(a_str, expected) + paddle.enable_static() + def test_print_tensor_dtype(self): paddle.disable_static(paddle.CPUPlace()) a = paddle.rand([1]) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py index 110e7bb3cbf41..3eefa0bce8863 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py @@ -85,11 +85,8 @@ def test_errors(self): # When the type of input is Variable, the dtype of input must be float16, float32, float64, int32, int64, bool. x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8") self.assertRaises(TypeError, fluid.layers.assign, x3) - # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. - x4 = np.array([[2.5, 2.5]], dtype='float64') + x4 = np.array([[2.5, 2.5]], dtype='uint8') self.assertRaises(TypeError, fluid.layers.assign, x4) - x5 = np.array([[2.5, 2.5]], dtype='uint8') - self.assertRaises(TypeError, fluid.layers.assign, x5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py new file mode 100755 index 0000000000000..a27d806319cb2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py @@ -0,0 +1,173 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
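Reviewer note on _test_slice_for_tensor_attr added to test_var_base.py above: it checks that 1-element integer Tensors can serve as slice bounds and steps in dygraph, giving the same result as plain Python ints. A hedged sketch of the behaviour being covered:

import numpy as np
import paddle

arr = np.arange(27).reshape([3, 3, 3]).astype('float32')
t = paddle.to_tensor(arr)
one = paddle.ones(shape=[1], dtype='int32')
neg = paddle.full(shape=[1], fill_value=-1, dtype='int32')
# Tensor indices behave like the equivalent Python ints
assert np.array_equal(t[one:].numpy(), arr[1:])
assert np.array_equal(t[::neg].numpy(), arr[::-1])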
+ +from __future__ import print_function + +import unittest +import numpy as np +import math +import paddle.fluid.core as core +import paddle +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import random +import sys + +sys.path.append("..") +from op_test_xpu import XPUOpTest +sys.path.append("../rnn") +from rnn_numpy import SimpleRNN, LSTM, GRU +from convert import get_params_for_net + +random.seed(2) +np.set_printoptions(threshold=np.inf) +paddle.enable_static() + + +class TestRNNOp(XPUOpTest): + def init_size(self): + self.seq_length = 1 + self.batch_size = 1 + self.input_size = 5 + self.hidden_size = 16 + + def get_weight_names(self): + weight_names = [] + for i in range(self.num_layers): + for j in range(0, 2 * self.direction_num): + weight_names.append("{}.weight_{}".format(i, j)) + for i in range(self.num_layers): + for j in range(0, 2 * self.direction_num): + weight_names.append("{}.bias_{}".format(i, j)) + return weight_names + + def setUp(self): + self.init_size() + self.op_type = "rnn" + self.dtype = np.float32 + self.sequence_length = np.ones( + (self.batch_size, ), dtype=np.int32) * self.seq_length + self.num_layers = 1 + self.is_bidirec = False + self.mode = "LSTM" + self.is_test = False + self.dropout = 0.0 + self.set_attrs() + + self.direction_num = 2 if self.is_bidirec else 1 + direction = "bidirectional" if self.is_bidirec else "forward" + + input = np.random.uniform( + low=-0.1, + high=0.1, + size=(self.seq_length, self.batch_size, + self.input_size)).astype(self.dtype) + + rnn1 = LSTM( + self.input_size, + self.hidden_size, + num_layers=self.num_layers, + time_major=True, + direction=direction, + dropout=self.dropout, + dtype="float32") + + flat_w = get_params_for_net(rnn1) + output, (last_hidden, last_cell) = rnn1( + input, sequence_length=self.sequence_length) + + init_h = np.zeros( + (self.num_layers * self.direction_num, self.batch_size, + self.hidden_size)).astype(self.dtype) + init_c = np.zeros( + (self.num_layers * self.direction_num, self.batch_size, + self.hidden_size)).astype(self.dtype) + state_out = np.ndarray((300)).astype("uint8") + + self.inputs = { + 'Input': input, + 'WeightList': flat_w, + 'PreState': [('init_h', init_h), ('init_c', init_c)], + 'SequenceLength': self.sequence_length + } + if self.sequence_length is None: + self.inputs = { + 'Input': input, + 'WeightList': flat_w, + 'PreState': [('init_h', init_h), ('init_c', init_c)], + } + self.attrs = { + 'dropout_prob': self.dropout, + 'is_bidirec': self.is_bidirec, + 'input_size': self.input_size, + 'hidden_size': self.hidden_size, + 'num_layers': self.num_layers, + 'mode': self.mode, + 'is_test': self.is_test + } + self.outputs = { + 'Out': output, + "State": [('last_hidden', last_hidden), ('last_cell', last_cell)], + 'Reserve': np.ndarray((400)).astype("uint8"), + 'DropoutState': state_out + } + + def test_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place( + place, atol=0.01, no_check_set=['Reserve', 'DropoutState']) + + def set_attrs(self): + pass + + def test_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + if not self.is_test: + var_name_list = self.get_weight_names() + grad_check_list = ['Input', 'init_h', 'init_c'] + grad_check_list.extend(var_name_list) + self.check_grad_with_place( + place, + set(grad_check_list), ['Out', 'last_hidden', 'last_cell'], + max_relative_error=0.1) + + +class TestRNNOpCase0(TestRNNOp): + def init_size(self): + self.seq_length = 2 + self.batch_size = 4 + 
self.input_size = 10 + self.hidden_size = 32 + + +class TestRNNOpCase1(TestRNNOp): + def init_size(self): + self.seq_length = 5 + self.batch_size = 16 + self.input_size = 30 + self.hidden_size = 64 + + +class TestRNNOpCase2(TestRNNOp): + def init_size(self): + self.seq_length = 10 + self.batch_size = 64 + self.input_size = 50 + self.hidden_size = 64 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index 752ec0672c216..ef6975c3d241e 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -17,6 +17,7 @@ import sys import math from functools import reduce +import os import collections import six @@ -101,34 +102,64 @@ def _init_communicator(self, program, current_endpoint, endpoints, rank, nranks = len(endpoints) other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) + block = program.global_block() + if rank == 0 and wait_port: wait_server_ready(other_endpoints) block = program.global_block() - nccl_id_var = block.create_var( - name=unique_name.generate('nccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - self.op_role_key: OpRole.Forward - }) - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': ring_id, - self.op_role_key: OpRole.Forward - }) + if core.is_compiled_with_npu(): + hccl_id_var = block.create_var( + name=unique_name.generate('hccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} + block.append_op( + type='c_gen_hccl_id', + inputs={}, + outputs={'Out': hccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + self.op_role_key: OpRole.Forward + }) + block.append_op( + type='c_comm_init_hccl', + inputs={'X': hccl_id_var}, + outputs={}, + attrs={ + 'rank': rank, + 'ring_id': ring_id, + 'device_id': int(os.getenv("FLAGS_selected_npus")), + 'rank_ids': nranks, + self.op_role_key: OpRole.Forward + }) + else: + nccl_id_var = block.create_var( + name=unique_name.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + self.op_role_key: OpRole.Forward + }) + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': ring_id, + self.op_role_key: OpRole.Forward + }) def _broadcast_params(self): block = self.startup_program.global_block() diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 5a616d81659b2..b8684874085a9 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -18,12 +18,16 @@ 'NPUPlace', 'get_default_dtype', 'set_default_dtype' ] -__all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel'] +__all__ += [ + 'grad', 'set_grad_enabled', 'LayerList', 'load', 'save', 'no_grad', + 'DataParallel' +] from . 
import random from .random import seed from .framework import get_default_dtype from .framework import set_default_dtype +from .framework import set_grad_enabled from ..fluid.param_attr import ParamAttr #DEFINE_ALIAS # from ..fluid.layers.tensor import create_global_var #DEFINE_ALIAS diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index 41ec18ce32d30..77be85a3195fd 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -15,7 +15,9 @@ # TODO: define framework api from paddle.fluid.layer_helper_base import LayerHelperBase from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid.framework import _dygraph_tracer import numpy as np +from contextlib import contextmanager __all__ = ['set_default_dtype', 'get_default_dtype'] @@ -80,3 +82,37 @@ def get_default_dtype(): paddle.get_default_dtype() """ return LayerHelperBase.get_default_dtype() + + +@contextmanager +def set_grad_enabled(mode): + """ + :api_attr: imperative + + Create a context which enables or disables dygraph gradient calculation. + + Args: + mode(bool): whether to enable (`True`), or disable (`False`) grad. + + Examples: + .. code-block:: python + x = paddle.ones([3, 2]) + x.stop_gradient = False + with torch.set_grad_enabled(False): + y = x * 2 + with torch.set_grad_enabled(True): + z = x * 2 + print(y.stop_gradient) # True + print(z.stop_gradient) # False + """ + + tracer = _dygraph_tracer() + if tracer: + prev_mode = tracer._has_grad + tracer._has_grad = mode + try: + yield + finally: + tracer._has_grad = prev_mode + else: + yield diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 4df84c12ad928..32a62d2461a14 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -235,11 +235,6 @@ def _pickle_save(obj, f, protocol): raise ValueError("Expected 1<'protocol'<5, but received protocol={}". format(protocol)) - if not isinstance(obj, (core.LoDTensor, core.VarBase)): - raise NotImplementedError( - "Support 'paddle.Tensor' or 'paddle.core.LoDTensor', but received {}.". - format(type(obj))) - def reudce_varbase(self): data = self.numpy() name = self.name @@ -287,11 +282,48 @@ def pop_dispatch_table(): pickler.dump(obj) -def _use_legacy(obj): - # TODO(weixin):If `obj` is any object, the judgment condition should be more precise. - if not isinstance(obj, dict): +def _contain_x(obj, condition_func): + if isinstance(obj, core.SelectedRows): + raise NotImplementedError( + "`paddle.save` do not support saving 'SelectedRows'.") + + if condition_func(obj): + return True + elif type(obj) in (dict, collections.OrderedDict, list, tuple): + if type(obj) in (dict, collections.OrderedDict): + keys = list(obj.keys()) + else: + keys = range(len(obj)) + flag = False + for key in keys: + flag |= _contain_x(obj[key], condition_func) + if flag: + return True + return flag + else: return False - return True + + +def _is_state_dict(obj): + if isinstance(obj, dict): + + def condition(obj): + return isinstance(obj, (core.Layer, Program, core.VarBase, + core.LoDTensor, core.SelectedRows)) + + # If the value of a dict is a core.VarBase/LoDTensor or a dict + # that does not contain a paddle type(Layer, Program, VarBase, LoDTensor, SelectedRows), + # the dict is considered to be a state_ dict. 
+ for key, value in obj.items(): + if isinstance(value, dict): + for k, v in value.items(): + if _contain_x(v, condition): + return False + elif not isinstance(value, (core.VarBase, core.LoDTensor)): + return False + return True + + return False def _transformed_from_varbase(obj): @@ -348,6 +380,120 @@ def _ndarray_to_tensor(obj, return_numpy): return _to_LodTensor(obj) +def _lod_tensor2varbase(tensor): + return_var = _varbase_creator() + return_var.value().get_tensor().set(tensor, _current_expected_place()) + return return_var + + +def _parse_every_object(obj, condition_func, convert_func): + if condition_func(obj): + return convert_func(obj) + elif type(obj) in (dict, collections.OrderedDict, list): + if type(obj) == list: + keys = range(len(obj)) + else: + keys = list(obj.keys()) + for key in keys: + if condition_func(obj[key]): + obj[key] = convert_func(obj[key]) + else: + obj[key] = _parse_every_object(obj[key], condition_func, + convert_func) + return obj + elif type(obj) == tuple: + return tuple( + _parse_every_object(list(obj), condition_func, convert_func)) + elif type(obj) == set: + return set(_parse_every_object(list(obj), condition_func, convert_func)) + else: + if isinstance(obj, collections.Iterable) and not isinstance(obj, ( + str, np.ndarray, core.VarBase, core.LoDTensor)): + raise NotImplementedError( + "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}.". + format(type(obj))) + return obj + + +def _parse_load_result(obj, return_numpy): + def is_layer(obj): + return isinstance(obj, core.Layer) + + def parse_layer(obj): + temp_dict = _parse_load_result(obj.__dict__, False) + obj.__dict__.update(temp_dict) + return obj + + if _contain_x(obj, is_layer): + if not in_dygraph_mode(): + raise ValueError( + "Layer can only be loaded in dynamic graph mode, but now in static graph mode." + ) + + _parse_every_object(obj, is_layer, parse_layer) + + def tuple_to_tensor(obj): + return _tuple_to_tensor(obj, return_numpy=return_numpy) + + def ndarray_to_tensor(obj): + return _ndarray_to_tensor(obj, return_numpy=return_numpy) + + # tuple(name, ndarry) was converted from varbase of paddle2.1, + # and all tuple(name, ndarry) are converted to tensor. + if _contain_x(obj, _transformed_from_varbase): + return _parse_every_object(obj, _transformed_from_varbase, + tuple_to_tensor) + # If there is no tuple(name, ndary), it is considered to be saved by paddle2.0 + # or converted from LoDTensor, and all ndarrays are converted to tensor. + else: + return _parse_every_object(obj, _transformed_from_lodtensor, + ndarray_to_tensor) + + +def _save_lod_tensor(tensor, file_name): + if not tensor._is_initialized(): + raise ValueError("The saved tensor is not initialized.") + _seek = core._save_lod_tensor(tensor, file_name) + # '_seek' is the end position of this tensor in the file. + return _seek + + +def _load_lod_tensor(file_name): + temp_t = paddle.fluid.core.LoDTensor() + # '_seek' is the end position of this tensor in the file. + _seek = paddle.fluid.core._load_lod_tensor(temp_t, file_name) + return temp_t, _seek + + +def _save_selected_rows(selected_rows, file_name): + # '_seek' is the end position of this SelectedRows in the file. + if not selected_rows.get_tensor()._is_initialized(): + raise ValueError("The saved tensor is not initialized.") + _seek = core._save_selected_rows(selected_rows, file_name) + return _seek + + +def _load_selected_rows(file_name): + temp_sr = core.SelectedRows() + # '_seek' is the end position of this SelectedRows in the file. 
+ _seek = core._load_selected_rows(temp_sr, file_name) + return temp_sr, _seek + + +def _save_binary_var(obj, path): + if isinstance(obj, core.LoDTensor): + _save_lod_tensor(obj, path) + elif isinstance(obj, core.SelectedRows): + _save_selected_rows(obj, path) + elif isinstance(obj, core.VarBase): + _save_lod_tensor(obj.value().get_tensor(), path) + else: + # Since the concept of 'Tensor' is only exposed to users, the error message can only contain tensor instead of 'LoDTensor' or 'SelectedRows' + raise NotImplementedError( + "When use_binary_format = True, `paddle.save` expected Tensor, but received {}.". + format(type(obj))) + + def save(obj, path, protocol=2, **configs): ''' Save an object to the specified path. @@ -447,25 +593,29 @@ def save(obj, path, protocol=2, **configs): "Type of `use_binary_format` should be bool, but received {}.". format(type(config.use_binary_format))) - # `protocol` need to be used, `pickle_protocol` is a deprecated arg. - if config.pickle_protocol is not None: - protocol = config.pickle_protocol - warnings.warn( - "'pickle_protocol' is a deprecated argument. Please use 'protocol' instead." - ) - if isinstance(obj, Program): - obj.desc.flush() - with open(path, "wb") as f: - f.write(obj.desc.serialize_to_string()) - elif _use_legacy(obj): - if in_dygraph_mode(): - _legacy_save(obj, path, protocol) - else: - _legacy_static_save(obj, path, protocol) + if config.use_binary_format: + _save_binary_var(obj, path) else: - # save single variable - with open(path, 'wb') as f: - _pickle_save(obj, f, protocol) + # `protocol` need to be used, `pickle_protocol` is a deprecated arg. + if config.pickle_protocol is not None: + protocol = config.pickle_protocol + warnings.warn( + "'pickle_protocol' is a deprecated argument. Please use 'protocol' instead." + ) + + if isinstance(obj, Program): + obj.desc.flush() + with open(path, "wb") as f: + f.write(obj.desc.serialize_to_string()) + + elif _is_state_dict(obj): + if in_dygraph_mode(): + _legacy_save(obj, path, protocol) + else: + _legacy_static_save(obj, path, protocol) + else: + with open(path, 'wb') as f: + _pickle_save(obj, f, protocol) def _legacy_save(obj, path, protocol=2): @@ -645,8 +795,7 @@ def load(path, **configs): # TODO(weixin):If `obj` is any object, the judgment condition should be more precise. if isinstance(load_result, dict): - if isinstance(load_result, dict): - load_result = _pack_loaded_dict(load_result) + load_result = _pack_loaded_dict(load_result) # paddle2.0: paddle.save/load if "StructuredToParameterName@@" in load_result: @@ -658,28 +807,37 @@ def load(path, **configs): del load_result["StructuredToParameterName@@"] else: # paddle2.1 static.save/load - for key in load_result: - load_result[key] = _ndarray_to_tensor( - load_result[key], config.return_numpy) + load_result = _parse_load_result(load_result, + config.return_numpy) else: - # TODO(weixin): support complex objects such as layer. - # If `obj` is any object, the judgment condition should be more precise. 
- if _transformed_from_lodtensor(load_result): - load_result = _ndarray_to_tensor(load_result, - config.return_numpy) - elif _transformed_from_varbase(load_result): - load_result = _tuple_to_tensor(load_result, - config.return_numpy) + load_result = _parse_load_result(load_result, + config.return_numpy) + + except exception_type as msg_pickle: + try: + tensor, _ = _load_selected_rows(path) + return tensor + except: + try: + tensor, _ = _load_lod_tensor(path) + if config.return_numpy: + return np.array(tensor) else: - raise NotImplementedError( - 'Only support tensor and state_dict, but received {}.'. - format(type(load_result))) - except exception_type: - with open(path, "rb") as f: - program_desc_str = f.read() - program = Program.parse_from_string(program_desc_str) - return program + if in_dygraph_mode(): + return _lod_tensor2varbase(tensor) + return tensor + except: + try: + with open(path, "rb") as f: + program_desc_str = f.read() + program = Program.parse_from_string( + program_desc_str) + return program + except: + raise ValueError( + "`paddle.load` can not parse the file:{}.".format( + path)) else: load_result = _legacy_load(path, **configs) diff --git a/python/paddle/hapi/__init__.py b/python/paddle/hapi/__init__.py index 0aea557a28c27..6b7672828e63d 100644 --- a/python/paddle/hapi/__init__.py +++ b/python/paddle/hapi/__init__.py @@ -15,6 +15,7 @@ from . import logger from . import callbacks from . import model_summary +from . import hub from . import model from .model import * diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py new file mode 100644 index 0000000000000..31a8be0944f3d --- /dev/null +++ b/python/paddle/hapi/hub.py @@ -0,0 +1,277 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import sys +import shutil +import zipfile +from paddle.utils.download import get_path_from_url + +DEFAULT_CACHE_DIR = '~/.cache' +VAR_DEPENDENCY = 'dependencies' +MODULE_HUBCONF = 'hubconf.py' +HUB_DIR = os.path.expanduser(os.path.join('~', '.cache', 'paddle', 'hub')) + + +def _remove_if_exists(path): + if os.path.exists(path): + if os.path.isfile(path): + os.remove(path) + else: + shutil.rmtree(path) + + +def _import_module(name, repo_dir): + sys.path.insert(0, repo_dir) + try: + hub_module = __import__(name) + sys.modules.pop(name) + except ImportError: + sys.path.remove(repo_dir) + raise RuntimeError( + 'Cannot import `{}`, please make sure `{}`.py in repo root dir'. 
+ format(name, name)) + + sys.path.remove(repo_dir) + + return hub_module + + +def _git_archive_link(repo_owner, repo_name, branch, source): + if source == 'github': + return 'https://github.com/{}/{}/archive/{}.zip'.format( + repo_owner, repo_name, branch) + elif source == 'gitee': + return 'https://gitee.com/{}/{}/repository/archive/{}.zip'.format( + repo_owner, repo_name, branch) + + +def _parse_repo_info(repo, source): + branch = 'main' if source == 'github' else 'master' + if ':' in repo: + repo_info, branch = repo.split(':') + else: + repo_info = repo + repo_owner, repo_name = repo_info.split('/') + return repo_owner, repo_name, branch + + +def _make_dirs(dirname): + try: + from pathlib import Path + except ImportError: + from pathlib2 import Path + Path(dirname).mkdir(exist_ok=True) + + +def _get_cache_or_reload(repo, force_reload, verbose=True, source='github'): + # Setup hub_dir to save downloaded files + hub_dir = HUB_DIR + + _make_dirs(hub_dir) + + # Parse github/gitee repo information + repo_owner, repo_name, branch = _parse_repo_info(repo, source) + # Github allows branch name with slash '/', + # this causes confusion with path on both Linux and Windows. + # Backslash is not allowed in Github branch name so no need to + # to worry about it. + normalized_br = branch.replace('/', '_') + # Github renames folder repo/v1.x.x to repo-1.x.x + # We don't know the repo name before downloading the zip file + # and inspect name from it. + # To check if cached repo exists, we need to normalize folder names. + repo_dir = os.path.join(hub_dir, + '_'.join([repo_owner, repo_name, normalized_br])) + + use_cache = (not force_reload) and os.path.exists(repo_dir) + + if use_cache: + if verbose: + sys.stderr.write('Using cache found in {}\n'.format(repo_dir)) + else: + cached_file = os.path.join(hub_dir, normalized_br + '.zip') + _remove_if_exists(cached_file) + + url = _git_archive_link(repo_owner, repo_name, branch, source=source) + + get_path_from_url(url, hub_dir, decompress=False) + + with zipfile.ZipFile(cached_file) as cached_zipfile: + extraced_repo_name = cached_zipfile.infolist()[0].filename + extracted_repo = os.path.join(hub_dir, extraced_repo_name) + _remove_if_exists(extracted_repo) + # Unzip the code and rename the base folder + cached_zipfile.extractall(hub_dir) + + _remove_if_exists(cached_file) + _remove_if_exists(repo_dir) + # rename the repo + shutil.move(extracted_repo, repo_dir) + + return repo_dir + + +def _load_entry_from_hubconf(m, name): + '''load entry from hubconf + ''' + if not isinstance(name, str): + raise ValueError( + 'Invalid input: model should be a str of function name') + + func = getattr(m, name, None) + + if func is None or not callable(func): + raise RuntimeError('Cannot find callable {} in hubconf'.format(name)) + + return func + + +def _check_module_exists(name): + try: + __import__(name) + return True + except ImportError: + return False + + +def _check_dependencies(m): + dependencies = getattr(m, VAR_DEPENDENCY, None) + + if dependencies is not None: + missing_deps = [ + pkg for pkg in dependencies if not _check_module_exists(pkg) + ] + if len(missing_deps): + raise RuntimeError('Missing dependencies: {}'.format(', '.join( + missing_deps))) + + +def list(repo_dir, source='github', force_reload=False): + r""" + List all entrypoints available in `github` hubconf. + + Args: + repo_dir(str): github or local path + github path (str): a str with format "repo_owner/repo_name[:tag_name]" with an optional + tag/branch. 
The default branch is `main` if not specified. + local path (str): local repo path + source (str): `github` | `gitee` | `local`, default is `github` + force_reload (bool, optional): whether to discard the existing cache and force a fresh download, default is `False`. + Returns: + entrypoints: a list of available entrypoint names + + Example: + ```python + import paddle + + paddle.hub.list('lyuwenyu/paddlehub_demo:main', source='github', force_reload=False) + + ``` + """ + if source not in ('github', 'gitee', 'local'): + raise ValueError( + 'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'. + format(source)) + + if source in ('github', 'gitee'): + repo_dir = _get_cache_or_reload( + repo_dir, force_reload, True, source=source) + + hub_module = _import_module(MODULE_HUBCONF.split('.')[0], repo_dir) + + entrypoints = [ + f for f in dir(hub_module) + if callable(getattr(hub_module, f)) and not f.startswith('_') + ] + + return entrypoints + + +def help(repo_dir, model, source='github', force_reload=False): + """ + Show help information of model + + Args: + repo_dir(str): github or local path + github path (str): a str with format "repo_owner/repo_name[:tag_name]" with an optional + tag/branch. The default branch is `main` if not specified. + local path (str): local repo path + model (str): model name + source (str): `github` | `gitee` | `local`, default is `github` + force_reload (bool, optional): default is `False` + Return: + docs + + Example: + ```python + import paddle + + paddle.hub.help('lyuwenyu/paddlehub_demo:main', model='MM', source='github') + ``` + """ + if source not in ('github', 'gitee', 'local'): + raise ValueError( + 'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'. + format(source)) + + if source in ('github', 'gitee'): + repo_dir = _get_cache_or_reload( + repo_dir, force_reload, True, source=source) + + hub_module = _import_module(MODULE_HUBCONF.split('.')[0], repo_dir) + + entry = _load_entry_from_hubconf(hub_module, model) + + return entry.__doc__ + + +def load(repo_dir, model, source='github', force_reload=False, **kwargs): + """ + Load model + + Args: + repo_dir(str): github or local path + github path (str): a str with format "repo_owner/repo_name[:tag_name]" with an optional + tag/branch. The default branch is `main` if not specified. + local path (str): local repo path + model (str): model name + source (str): `github` | `gitee` | `local`, default is `github` + force_reload (bool, optional), default is `False` + **kwargs: parameters using for model + Return: + paddle model + Example: + ```python + import paddle + paddle.hub.load('lyuwenyu/paddlehub_demo:main', model='MM', source='github') + ``` + """ + if source not in ('github', 'gitee', 'local'): + raise ValueError( + 'Unknown source: "{}". Allowed values: "github" | "gitee" | "local".'. 
+ format(source)) + + if source in ('github', 'gitee'): + repo_dir = _get_cache_or_reload( + repo_dir, force_reload, True, source=source) + + hub_module = _import_module(MODULE_HUBCONF.split('.')[0], repo_dir) + + _check_dependencies(hub_module) + + entry = _load_entry_from_hubconf(hub_module, model) + + return entry(**kwargs) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 4f3d73b22e390..6cd879c388c1f 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -133,33 +133,59 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, return other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) + block = program.global_block() if rank == 0 and wait_port: wait_server_ready(other_endpoints) - block = program.global_block() - nccl_id_var = block.create_var( - name=fluid.unique_name.generate('nccl_id'), - persistable=True, - type=fluid.core.VarDesc.VarType.RAW) - - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': 0, - }) + if core.is_compiled_with_cuda(): + nccl_id_var = block.create_var( + name=fluid.unique_name.generate('nccl_id'), + persistable=True, + type=fluid.core.VarDesc.VarType.RAW) + + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints + }) + + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': 0, + }) + elif core.is_compiled_with_npu(): + hccl_id_var = block.create_var( + name=unique_name.generate('hccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} + block.append_op( + type='c_gen_hccl_id', + inputs={}, + outputs={'Out': hccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints + }) + block.append_op( + type='c_comm_init_hccl', + inputs={'X': hccl_id_var}, + outputs={}, + attrs={ + 'rank': rank, + 'ring_id': 0, + 'device_id': int(os.getenv("FLAGS_selected_npus")), + 'rank_ids': nranks + }) def prepare_distributed_context(place=None): @@ -861,10 +887,10 @@ class Model(object): AdamW and Momentum optimizer. Before using pure float16 training, `multi_precision` could be set to True when creating optimizer, which can avoid poor accuracy or slow convergence in a way, and inputs of dtype float - should be cast to float16 by users. Users should also use - `paddle.static.amp.fp16_guard` API to limit the range of pure float16 - training, otherwise, 'use_fp16_guard' should be set to False by users. - However, limiting the range of is not supported during training using AMP. + should be cast to float16 by users. `paddle.static.amp.fp16_guard` API + should be also used to limit the range of pure float16 training, otherwise, + 'use_fp16_guard' should be set to False by users. However, limiting the + range of is not supported during training using AMP. 
Args: network (paddle.nn.Layer): The network is an instance of @@ -948,7 +974,7 @@ def run_example_code(): data = paddle.vision.datasets.MNIST(mode='train', transform=transform) model.fit(data, epochs=2, batch_size=32, verbose=1) - # mixed precision training is only support on GPU now. + # mixed precision training is only supported on GPU now. if paddle.is_compiled_with_cuda(): run_example_code() @@ -1436,19 +1462,18 @@ def prepare(self, optimizer=None, loss=None, metrics=None, float16 training is used, the key 'level' of 'amp_configs' should be set to 'O1' or 'O2' respectively. Otherwise, the value of 'level' defaults to 'O0', which means float32 - training. In addition to 'level', users could pass in more - parameters consistent with mixed precision API. The supported + training. In addition to 'level', parameters consistent with + mixed precision API could also be passed in. The supported keys are: 'init_loss_scaling', 'incr_ratio', 'decr_ratio', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'use_dynamic_loss_scaling', 'custom_white_list', 'custom_black_list', and 'custom_black_varnames'or - 'use_fp16_guard' is only supported in static mode. Users could - refer to mixed precision API documentations - :ref:`api_paddle_amp_auto_cast` and - :ref:`api_paddle_amp_GradScaler` for details. For convenience, - 'amp_configs' could be set to 'O1' or 'O2' if no more - parameters are needed. 'amp_configs' could be None in float32 - training. Default: None. + 'use_fp16_guard' is only supported in static mode. Mixed + precision API documentations :ref:`api_paddle_amp_auto_cast` + and :ref:`api_paddle_amp_GradScaler` could be referenced + for details. For convenience, 'amp_configs' could be set to + 'O1' or 'O2' if no more parameters are needed. 'amp_configs' + could be None in float32 training. Default: None. Returns: None """ diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index bf13ff32ca0e4..03e5a88624086 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -14,10 +14,8 @@ from . import optimizer from . import checkpoint -from ..fluid.contrib import reader from ..fluid.layer_helper import LayerHelper __all__ = [] -__all__ += ["reader"] __all__ += optimizer.__all__ __all__ += checkpoint.__all__ diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 59e2729941e41..5781f78c6e4e4 100755 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -13,26 +13,37 @@ # limitations under the License. 
# TODO: define all functions about input & output in this directory -__all__ = [ - 'Dataset', - 'IterableDataset', - 'TensorDataset', - 'ComposeDataset', - 'ChainDataset', - 'BatchSampler', - 'DistributedBatchSampler', - # 'Transform', - 'DataLoader', - 'get_worker_info', - 'Sampler', - 'SequenceSampler', - 'RandomSampler', - 'WeightedRandomSampler', - 'random_split', - 'Subset' -] -from ..fluid.io import DataLoader -from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \ - TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler, \ - ComposeDataset, ChainDataset, WeightedRandomSampler, Subset, random_split +from ..fluid.io import DataLoader # noqa: F401 +from ..fluid.dataloader import Dataset # noqa: F401 +from ..fluid.dataloader import IterableDataset # noqa: F401 +from ..fluid.dataloader import BatchSampler # noqa: F401 +from ..fluid.dataloader import get_worker_info # noqa: F401 +from ..fluid.dataloader import TensorDataset # noqa: F401 +from ..fluid.dataloader import Sampler # noqa: F401 +from ..fluid.dataloader import SequenceSampler # noqa: F401 +from ..fluid.dataloader import RandomSampler # noqa: F401 +from ..fluid.dataloader import DistributedBatchSampler # noqa: F401 +from ..fluid.dataloader import ComposeDataset # noqa: F401 +from ..fluid.dataloader import ChainDataset # noqa: F401 +from ..fluid.dataloader import WeightedRandomSampler # noqa: F401 +from ..fluid.dataloader import Subset # noqa: F401 +from ..fluid.dataloader import random_split # noqa: F401 + +__all__ = [ #noqa + 'Dataset', + 'IterableDataset', + 'TensorDataset', + 'ComposeDataset', + 'ChainDataset', + 'BatchSampler', + 'DistributedBatchSampler', + 'DataLoader', + 'get_worker_info', + 'Sampler', + 'SequenceSampler', + 'RandomSampler', + 'WeightedRandomSampler', + 'random_split', + 'Subset' +] diff --git a/python/paddle/metric/metrics.py b/python/paddle/metric/metrics.py index 0784775b6695e..b939f548e9c01 100644 --- a/python/paddle/metric/metrics.py +++ b/python/paddle/metric/metrics.py @@ -243,7 +243,7 @@ def __init__(self, topk=(1, ), name=None, *args, **kwargs): def compute(self, pred, label, *args): """ - Compute the top-k (maxinum value in `topk`) indices. + Compute the top-k (maximum value in `topk`) indices. Args: pred (Tensor): The predicted value is a Tensor with dtype @@ -253,7 +253,7 @@ def compute(self, pred, label, *args): [batch_size, d0, ..., num_classes] in one hot representation. Return: - Tensor: Correct mask, a tensor with shape [batch_size, topk]. + Tensor: Correct mask, a tensor with shape [batch_size, d0, ..., topk]. """ pred = paddle.argsort(pred, descending=True) pred = paddle.slice( @@ -277,7 +277,7 @@ def update(self, correct, *args): returns the accuracy of current step. Args: - correct: Correct mask, a tensor with shape [batch_size, topk]. + correct: Correct mask, a tensor with shape [batch_size, d0, ..., topk]. Return: Tensor: the accuracy of current step. 
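The Accuracy docstring fix above clarifies that the correct mask keeps any intermediate dimensions, i.e. its shape is [batch_size, d0, ..., topk] rather than [batch_size, topk]. A minimal sketch of how compute/update interact, assuming the existing `paddle.metric.Accuracy` API; the predictions and labels below are made up for illustration:

```python
import paddle

metric = paddle.metric.Accuracy(topk=(1, 2))
pred = paddle.to_tensor([[0.1, 0.7, 0.2],
                         [0.6, 0.3, 0.1]])      # [batch_size, num_classes]
label = paddle.to_tensor([[1], [2]])            # [batch_size, 1]

correct = metric.compute(pred, label)           # correct mask; here [batch_size, topk] = [2, 2]
metric.update(correct)                          # accuracy of the current step
print(metric.accumulate())                      # one accumulated accuracy per requested k
```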
diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 27d8f35234a35..836d4008f7d0b 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -55,6 +55,7 @@ from .layer.activation import ReLU #DEFINE_ALIAS from .layer.activation import ReLU6 #DEFINE_ALIAS from .layer.activation import SELU #DEFINE_ALIAS +from .layer.activation import Silu #DEFINE_ALIAS from .layer.activation import LeakyReLU #DEFINE_ALIAS from .layer.activation import Sigmoid #DEFINE_ALIAS from .layer.activation import Hardsigmoid #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 36f39a5056ed5..98124be7288d0 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -46,6 +46,7 @@ from .activation import relu6 #DEFINE_ALIAS from .activation import selu #DEFINE_ALIAS from .activation import sigmoid #DEFINE_ALIAS +from .activation import silu #DEFINE_ALIAS # from .activation import soft_relu #DEFINE_ALIAS from .activation import softmax #DEFINE_ALIAS from .activation import softmax_ #DEFINE_ALIAS @@ -58,6 +59,7 @@ from .activation import tanhshrink #DEFINE_ALIAS from .activation import thresholded_relu #DEFINE_ALIAS from .activation import log_softmax #DEFINE_ALIAS +from .activation import glu #DEFINE_ALIAS from .common import dropout #DEFINE_ALIAS from .common import dropout2d #DEFINE_ALIAS from .common import dropout3d #DEFINE_ALIAS @@ -97,6 +99,7 @@ # from .extension import temporal_shift #DEFINE_ALIAS # from .extension import warpctc #DEFINE_ALIAS from .extension import diag_embed #DEFINE_ALIAS +from .extension import sequence_mask # from .lod import sequence_concat #DEFINE_ALIAS # from .lod import sequence_conv #DEFINE_ALIAS # from .lod import sequence_enumerate #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 3553a93dfab20..d74308dc9aa32 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -23,6 +23,8 @@ from ...tensor.math import tanh_ #DEFINE_ALIAS from ...tensor.manipulation import _print_warning_in_static_mode +from ...tensor.manipulation import chunk +from ...tensor.math import multiply __all__ = [ 'brelu', @@ -47,12 +49,14 @@ 'softshrink', 'softsign', 'sigmoid', + 'silu' 'swish', 'tanh', 'tanh_', 'tanhshrink', 'thresholded_relu', 'log_softmax', + 'glu', ] import warnings @@ -758,6 +762,39 @@ def selu(x, return out +def silu(x, name=None): + """ + silu activation. + .. math: + silu(x) = \frac{x}{1 + e^{-x}} + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] + """ + + if in_dygraph_mode(): + return core.ops.silu(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'silu') + helper = LayerHelper("silu", **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + helper.append_op(type='silu', inputs={'X': x}, outputs={'Out': out}) + return out + + def softmax(x, axis=-1, dtype=None, name=None): r""" This operator implements the softmax layer. 
The calculation process is as follows: @@ -1276,3 +1313,50 @@ def log_softmax(x, axis=-1, dtype=None, name=None): attrs={'axis': axis}) return out + + +def glu(x, axis=-1, name=None): + r""" + The gated linear unit. The input is evenly splited into 2 parts along a + given axis. The first part is used as the content, and the second part is + passed through a sigmoid function then used as the gate. The output is a + elementwise multiplication of the content and the gate. + + .. math:: + + \mathrm{GLU}(a, b) = a \otimes \sigma(b) + + Parameters: + x (Tensor): The input Tensor with data type float32, float64. + axis (int, optional): The axis along which split the input tensor. It + should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` < 0, it works the same way as :math:`axis + D` . + Default is -1. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor with the same data type as x. The size of the given aixs is + halved. + + Examples: + .. code-block:: python + + import paddle + from paddle.nn import functional as F + + x = paddle.to_tensor( + [[-0.22014759, -1.76358426, 0.80566144, 0.04241343], + [-1.94900405, -1.89956081, 0.17134808, -1.11280477]] + ) + print(F.glu(x).numpy()) + # array([[-0.15216254, -0.9004892 ], + # [-1.0577879 , -0.46985325]], dtype=float32) + + """ + check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], + "glu") + a, b = chunk(x, 2, axis=axis, name=name) + gate = sigmoid(b, name=name) + out = paddle.multiply(a, gate, name=name) + return out diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 0789d0b67b740..5263d54045ef1 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -414,7 +414,7 @@ def conv2d(x, .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) Where: @@ -441,8 +441,8 @@ def conv2d(x, .. math:: - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 + H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\ + W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1 Args: x (Tensor): The input is 4-D Tensor with shape [N, C, H, W], the data type @@ -627,7 +627,7 @@ def conv1d_transpose(x, so for conv1d_transpose, when stride > 1, input shape maps multiple output shape. If output_size is None, :math:`L_{out} = L^\prime_{out}`; else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}` - and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically. + and :math:`L^\prime_{out} + stride`. Args: x(Tensor): 3-D tensor with [N, C, L] or [N, L, C] format, @@ -657,10 +657,7 @@ def conv1d_transpose(x, Default: dilation = 1. output_size(int|tuple|list, optional): The output image size. If output size is a tuple, it must contain one integer, `(feature_length)`. None if use - filter_size, padding, and stride to calculate output_size. - If output_size and filter_size are specified at the same time, They - should follow the formula above. Default: None. output_size and filter_size - should not be None at the same time. + filter_size(shape of weight), padding, and stride to calculate output_size. 
data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`. The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: @@ -854,7 +851,7 @@ def conv2d_transpose(x, .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) Where: @@ -893,8 +890,7 @@ def conv2d_transpose(x, If output_size is None, :math:`H_{out} = H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` and :math:`H^\prime_{out} + strides[0]`, and the :math:`W_{out}` of the output size must - between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`, - conv2d_transpose can compute the kernel size automatically. + between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[1]`. Args: x(Tensor): 4-D Tensor with [N, C, H, W] or [N, H, W, C] format, @@ -929,10 +925,7 @@ def conv2d_transpose(x, Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. output_size(int|tuple|list, optional): The output image size. If output size is a tuple, it must contain two integers, (image_height, image_width). None if use - filter_size, padding, and stride to calculate output_size. - If output_size is specified, output_size and filter_size (weight)'s shape - should follow the formula above. Default: None. output_size and filter_size - should not be None at the same time. + filter_size(shape of weight), padding, and stride to calculate output_size. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -1090,7 +1083,7 @@ def conv3d(x, .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) In the above equation: @@ -1246,7 +1239,7 @@ def conv3d_transpose(x, .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) In the above equation: @@ -1289,8 +1282,7 @@ def conv3d_transpose(x, size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must - between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, - conv3d_transpose can compute the kernel size automatically. + between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`. Args: x(Tensor): The input is 5-D Tensor with shape [N, C, D, H, W] or [N, D, H, W, C], the data type @@ -1326,10 +1318,8 @@ def conv3d_transpose(x, dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. output_size(int|list|tuple, optional): The output image size. If output size is a - tuple, it must contain three integers, (image_depth, image_height, image_width). This - parameter only works when filter_size is None. If output_size and filter_size are - specified at the same time, They should follow the formula above. Default: None. - Output_size and filter_size should not be None at the same time. + tuple, it must contain three integers, (image_depth, image_height, image_width). + None if use filter_size(shape of weight), padding, and stride to calculate output_size. 
data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 3bbdb89f16c0a..b004d79a877e7 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -14,7 +14,7 @@ # TODO: define the extention functions -__all__ = ['diag_embed'] +__all__ = ['diag_embed', 'sequence_mask'] import numpy as np from ...fluid.data_feeder import check_dtype @@ -23,6 +23,7 @@ from ...fluid.layers.tensor import assign from ...fluid import core, dygraph_utils from ...fluid.layers.layer_function_generator import templatedoc +from ...fluid.layers.sequence_lod import sequence_mask def diag_embed(input, offset=0, dim1=-2, dim2=-1): diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 6c8a2d1cbce85..6eb316ceeb8c9 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1023,7 +1023,8 @@ def ctc_loss(log_probs, input_lengths, label_lengths, blank=0, - reduction='mean'): + reduction='mean', + norm_by_times=False): """ An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) @@ -1038,6 +1039,7 @@ def ctc_loss(log_probs, label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64. blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0. reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. + norm_by_times (bool, default False) – Whether to normalize the gradients by the number of time-step, which is also the sequence’s length. There is no need to normalize the gradients if reduction mode is 'mean'. Returns: Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``. 
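Because `paddle.nn.functional.ctc_loss` gains a trailing `norm_by_times` keyword in this change, a short usage sketch may help; the shapes follow the docstring above, the rest of the signature is assumed unchanged, and the tensors are illustrative only:

```python
import paddle
import paddle.nn.functional as F

T, B, C = 5, 2, 4                                    # time steps, batch size, classes (incl. blank 0)
log_probs = paddle.rand([T, B, C], dtype='float32')  # unscaled probability sequence
labels = paddle.to_tensor([[1, 2, 2],
                           [1, 3, 0]], dtype='int32')    # padded label sequences
input_lengths = paddle.to_tensor([5, 5], dtype='int64')
label_lengths = paddle.to_tensor([3, 2], dtype='int64')

# norm_by_times defaults to False; as the docstring notes, it is unnecessary
# when reduction='mean' already divides the loss by label_lengths.
loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths,
                  blank=0, reduction='mean', norm_by_times=False)
```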
@@ -1101,7 +1103,7 @@ def ctc_loss(log_probs, """ - loss_out = fluid.layers.warpctc(log_probs, labels, blank, False, + loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times, input_lengths, label_lengths) loss_out = fluid.layers.squeeze(loss_out, [-1]) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 54824233f7076..e6971b3781c3b 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -223,24 +223,27 @@ def batch_norm(x, helper = LayerHelper('batch_norm', **locals()) - dtype = x.dtype if x.dtype is not 'float16' else 'float32' + param_dtype = x.dtype if x.dtype is not 'float16' else 'float32' saved_mean = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=param_dtype, stop_gradient=True) saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) - batch_norm_out = helper.create_variable_for_type_inference(dtype) - reserve_space = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) + dtype=param_dtype, stop_gradient=True) + batch_norm_out = helper.create_variable_for_type_inference(x.dtype) outputs = { "Y": [batch_norm_out], "MeanOut": [running_mean], "VarianceOut": [running_var], "SavedMean": [saved_mean], - "SavedVariance": [saved_variance], - "ReserveSpace": [reserve_space] + "SavedVariance": [saved_variance] } + if training or trainable_statistics: + # reserve_space is only used for training. + reserve_space = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True) + outputs["ReserveSpace"] = [reserve_space] + helper.append_op( type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs) diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 69cdb7381716b..2a9ae310615ce 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -27,6 +27,7 @@ 'SELU', 'LeakyReLU', 'Sigmoid', + 'Silu', 'Hardsigmoid', 'Softmax', 'Softplus', @@ -919,6 +920,44 @@ def extra_repr(self): return 'threshold={}{}'.format(self._threshold, name_str) +class Silu(layers.Layer): + """ + Silu Activation. + .. math:: + + Silu(x) = \frac{x}{1 + e^{-x}} + + Parameters: + x (Tensor): The input Tensor with data type float32, or float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - input: Tensor with any shape. + - output: Tensor with the same shape as input. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + m = paddle.nn.Silu() + out = m(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] + """ + + def __init__(self, name=None): + super(Silu, self).__init__() + self._name = name + + def forward(self, x): + return F.silu(x, self._name) + + def extra_repr(self): + name_str = 'name={}'.format(self._name) if self._name else '' + return name_str + + class LogSigmoid(layers.Layer): r""" LogSigmoid Activation. diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 55f5b823e35f4..b90421c2f8c29 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -199,7 +199,7 @@ class Conv1D(_ConvNd): .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) Where: @@ -226,7 +226,7 @@ class Conv1D(_ConvNd): .. 
math:: - L_{out}&= \\frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1 + L_{out}&= \frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1 \\ Parameters: in_channels(int): The number of channels in the input image. @@ -258,7 +258,7 @@ class Conv1D(_ConvNd): of conv1d. If it is set to None or one attribute of ParamAttr, conv1d will create ParamAttr as param_attr. If the Initializer of the param_attr is not set, the parameter is initialized with :math:`Normal(0.0, std)`, - and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. + and the :math:`std` is :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None. bias_attr (ParamAttr or bool, optional): The attribute for the bias of conv1d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv1d @@ -368,7 +368,7 @@ class Conv1DTranspose(_ConvNd): .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) Where: @@ -404,7 +404,7 @@ class Conv1DTranspose(_ConvNd): so for conv1d_transpose, when stride > 1, input shape maps multiple output shape. If output_size is None, :math:`L_{out} = L^\prime_{out}`; else, the :math:`L_{out}` of the output size must between :math:`L^\prime_{out}` - and :math:`L^\prime_{out} + stride`. conv1d_transpose can compute the kernel size automatically. + and :math:`L^\prime_{out} + stride`. Args: in_channels(int): The number of channels in the input image. @@ -540,7 +540,7 @@ class Conv2D(_ConvNd): .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) Where: @@ -578,7 +578,7 @@ class Conv2D(_ConvNd): of conv2d. If it is set to None or one attribute of ParamAttr, conv2d will create ParamAttr as param_attr. If it is set to None, the parameter is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is - :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv2d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv2d @@ -603,9 +603,9 @@ class Conv2D(_ConvNd): .. math:: - H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + H_{out}&= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 - W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + W_{out}&= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 Examples: @@ -696,7 +696,7 @@ class Conv2DTranspose(_ConvNd): .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) Where: @@ -710,7 +710,7 @@ class Conv2DTranspose(_ConvNd): Parameters: in_channels(int): The number of channels in the input image. out_channels(int): The number of channels produced by the convolution. - kernel_size(int|list|uple): The kernel size. If kernel_size is a tuple, + kernel_size(int|list|tuple): The kernel size. If kernel_size is a tuple, it must contain two integers, (kernel_size_H, kernel_size_W). Otherwise, the kernel will be a square. stride(int|list|tuple, optional): The stride size. If stride is a tuple, it must @@ -851,7 +851,7 @@ class Conv3D(_ConvNd): .. 
math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) In the above equation: @@ -889,7 +889,7 @@ class Conv3D(_ConvNd): of conv3d. If it is set to None or one attribute of ParamAttr, conv3d will create ParamAttr as param_attr. If it is set to None, the parameter is initialized with :math:`Normal(0.0, std)`, and the :math:`std` is - :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. + :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`. The default value is None. bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of conv3d. If it is set to False, no bias will be added to the output units. If it is set to None or one attribute of ParamAttr, conv3d @@ -914,11 +914,11 @@ class Conv3D(_ConvNd): .. math:: - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 + D_{out}&= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (kernel\_size[0] - 1) + 1))}{strides[0]} + 1 - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 + H_{out}&= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (kernel\_size[1] - 1) + 1))}{strides[1]} + 1 - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 + W_{out}&= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 Raises: ValueError: If the shapes of input, filter_size, stride, padding and @@ -1010,7 +1010,7 @@ class Conv3DTranspose(_ConvNd): .. math:: - Out = \sigma (W \\ast X + b) + Out = \sigma (W \ast X + b) In the above equation: diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index ad046b9041750..2dfb3acca68e1 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1060,6 +1060,7 @@ class CTCLoss(fluid.dygraph.Layer): labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32. input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64. label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64. + norm_by_times (bool, default false) – Whether to normalize the gradients by the number of time-step, which is also the sequence’s length. There is no need to normalize the gradients if reduction mode is 'mean'. Returns: Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``. 
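The layer counterpart follows the same pattern: as the next hunk shows, `CTCLoss.forward` now forwards `norm_by_times` to `F.ctc_loss`. A brief sketch with the same illustrative tensors as above:

```python
import paddle

loss_fn = paddle.nn.CTCLoss(blank=0, reduction='mean')
log_probs = paddle.rand([5, 2, 4], dtype='float32')
labels = paddle.to_tensor([[1, 2, 2], [1, 3, 0]], dtype='int32')
input_lengths = paddle.to_tensor([5, 5], dtype='int64')
label_lengths = paddle.to_tensor([3, 2], dtype='int64')

# norm_by_times is passed through to the functional ctc_loss shown earlier.
loss = loss_fn(log_probs, labels, input_lengths, label_lengths, norm_by_times=False)
```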
@@ -1122,10 +1123,20 @@ def __init__(self, blank=0, reduction='mean'): self.blank = blank self.reduction = reduction - def forward(self, log_probs, labels, input_lengths, label_lengths): - return paddle.nn.functional.ctc_loss(log_probs, labels, input_lengths, - label_lengths, self.blank, - self.reduction) + def forward(self, + log_probs, + labels, + input_lengths, + label_lengths, + norm_by_times=False): + return paddle.nn.functional.ctc_loss( + log_probs, + labels, + input_lengths, + label_lengths, + self.blank, + self.reduction, + norm_by_times=norm_by_times) class SmoothL1Loss(fluid.dygraph.Layer): diff --git a/python/paddle/optimizer/__init__.py b/python/paddle/optimizer/__init__.py index edebfdfcf3710..07d2935bc7646 100644 --- a/python/paddle/optimizer/__init__.py +++ b/python/paddle/optimizer/__init__.py @@ -12,19 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = [ - 'Optimizer', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'RMSProp', 'Adadelta', - 'SGD', 'Momentum', 'Lamb', 'lr' -] +from .optimizer import Optimizer # noqa: F401 +from .adagrad import Adagrad # noqa: F401 +from .adam import Adam # noqa: F401 +from .adamw import AdamW # noqa: F401 +from .adamax import Adamax # noqa: F401 +from .rmsprop import RMSProp # noqa: F401 +from .adadelta import Adadelta # noqa: F401 +from .sgd import SGD # noqa: F401 +from .momentum import Momentum # noqa: F401 +from .lamb import Lamb # noqa: F401 +from . import lr # noqa: F401 -from .optimizer import Optimizer -from .adagrad import Adagrad -from .adam import Adam -from .adamw import AdamW -from .adamax import Adamax -from .rmsprop import RMSProp -from .adadelta import Adadelta -from .sgd import SGD -from .momentum import Momentum -from .lamb import Lamb -from . 
import lr +__all__ = [ #noqa + 'Optimizer', + 'Adagrad', + 'Adam', + 'AdamW', + 'Adamax', + 'RMSProp', + 'Adadelta', + 'SGD', + 'Momentum', + 'Lamb' +] diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index e921eda41cfb6..42e2a5851c21b 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -17,8 +17,6 @@ from ..fluid import framework from ..fluid.framework import Variable, name_scope -__all__ = ["Adadelta"] - class Adadelta(Optimizer): r""" diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index ec14828e693ee..d3077949ff0ae 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -17,8 +17,6 @@ from ..fluid import framework from ..fluid.framework import Variable -__all__ = ["Adagrad"] - class Adagrad(Optimizer): r""" diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 0cafbda893dd2..dcedf4fc5020a 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -24,8 +24,6 @@ import paddle -__all__ = ["Adam"] - class Adam(Optimizer): r""" diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index 4a6c2278a46f4..9d5adf0bba508 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -17,8 +17,6 @@ from ..fluid import framework from ..fluid.framework import Variable, name_scope -__all__ = ["Adamax"] - class Adamax(Optimizer): r""" diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 78c9fcb83fc24..eb88a48f30320 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -19,8 +19,6 @@ from ..fluid.dygraph import base as imperative_base import paddle -__all__ = ['AdamW'] - class AdamW(Adam): r""" @@ -59,7 +57,7 @@ class AdamW(Adam): weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. apply_decay_param_fun (function|None, optional): If it is not None, only tensors that makes apply_decay_param_fun(Tensor.name)==True - will be updated. It only works when we want to specify tensors. + will be updated with weight decay. It only works when we want to specify tensors. Default: None. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . 
There are three cliping strategies diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py index a692f59de5b5f..bab130ec59098 100644 --- a/python/paddle/optimizer/lamb.py +++ b/python/paddle/optimizer/lamb.py @@ -17,8 +17,6 @@ from ..fluid import framework from ..fluid.framework import Variable -__all__ = ["Lamb"] - class Lamb(Optimizer): r""" diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 484b4fb7246a7..f269bffc75ed9 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -17,10 +17,19 @@ import warnings from paddle import Tensor -__all__ = [ - 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', - 'InverseTimeDecay', 'PolynomialDecay', 'LinearWarmup', 'ExponentialDecay', - 'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau', +__all__ = [ #noqa + 'LRScheduler', + 'NoamDecay', + 'PiecewiseDecay', + 'NaturalExpDecay', + 'InverseTimeDecay', + 'PolynomialDecay', + 'LinearWarmup', + 'ExponentialDecay', + 'MultiStepDecay', + 'StepDecay', + 'LambdaDecay', + 'ReduceOnPlateau', 'CosineAnnealingDecay' ] diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 111b2720c8668..932a4ad100ec4 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -21,7 +21,6 @@ from ..fluid import layers import paddle.fluid as fluid from paddle.fluid.regularizer import L2DecayRegularizer -__all__ = ["Momentum"] class Momentum(Optimizer): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index b37d172606411..a050852728da9 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -42,8 +42,6 @@ from .. import compat as cpt from .lr import LRScheduler -__all__ = ['Optimizer'] - class Optimizer(object): r"""Optimizer Base class. @@ -100,8 +98,19 @@ def __init__(self, weight_decay=None, grad_clip=None, name=None): - self._parameter_list = list( - parameters) if parameters is not None else None + if parameters is not None: + # paddle.Tensor is also iterable, so here we don't check whether + # the input is iterable, if the input is paddle.Tensor, the + # list(paddle.Tensor) will be a error value + if isinstance(parameters, paddle.Tensor): + raise TypeError( + "`parameters` argument given to the optimizer should be " + "an iterable of paddle Tensors, but got argument type is `{}`.". + format(type(parameters))) + self._parameter_list = list(parameters) + else: + self._parameter_list = None + self._name = name if framework.in_dygraph_mode(): if self._parameter_list is None: @@ -110,7 +119,8 @@ def __init__(self, ) if weight_decay is not None: for param in self._parameter_list: - if param.regularizer is not None: + if hasattr(param, + 'regularizer') and param.regularizer is not None: logging.info( "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" 
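Editor's note: the stricter `parameters` handling added to Optimizer.__init__ above changes what callers may pass. A minimal sketch, assuming the default dygraph mode; `linear` is just an illustrative layer:

# Sketch of the new check: an iterable of Tensors is accepted, while a single
# bare Tensor now raises TypeError instead of being silently list()-ed element-wise.
import paddle

linear = paddle.nn.Linear(2, 4)

# Accepted: layer.parameters() is an iterable of paddle Tensors.
opt = paddle.optimizer.Adam(learning_rate=0.001, parameters=linear.parameters())

# Rejected: passing one Tensor directly trips the new isinstance(parameters, paddle.Tensor) guard.
try:
    paddle.optimizer.Adam(learning_rate=0.001, parameters=linear.weight)
except TypeError as err:
    print(err)
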
@@ -433,17 +443,20 @@ def _append_optimize_op(self, block, param_and_grad): def _create_param_lr(self, param_and_grad): # create learning rate tensor for every parameter param = param_and_grad[0] - param_lr = param.optimize_attr['learning_rate'] - if type(param_lr) == Variable: - return param_lr - else: - if param_lr == 1.0: - return self._global_learning_rate() + if hasattr(param, 'optimize_attr'): + param_lr = param.optimize_attr['learning_rate'] + if type(param_lr) == Variable: + return param_lr else: - with default_main_program()._lr_schedule_guard( - is_with_opt=True), framework.name_scope( - 'scale_with_param_lr'): - return self._global_learning_rate() * param_lr + if param_lr == 1.0: + return self._global_learning_rate() + else: + with default_main_program()._lr_schedule_guard( + is_with_opt=True), framework.name_scope( + 'scale_with_param_lr'): + return self._global_learning_rate() * param_lr + else: + return self._global_learning_rate() def _create_accumulators(self, block, parameters): """Create all accumulators needed by the parameters diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 12825bb781381..7146b7d89935c 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -17,8 +17,6 @@ from ..fluid import framework from ..fluid.framework import Variable -__all__ = ["RMSProp"] - class RMSProp(Optimizer): r""" diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index b2937ff162064..fc208519a2e61 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -17,7 +17,6 @@ from ..fluid import framework from ..fluid.framework import Variable, name_scope from ..fluid.dygraph import no_grad -__all__ = ["SGD"] class SGD(Optimizer): diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 1a4d45469235d..9002cd0676eda 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -63,7 +63,15 @@ def reader(): """ -import paddle.reader.decorator -from paddle.reader.decorator import * +from paddle.reader.decorator import map_readers # noqa: F401 +from paddle.reader.decorator import shuffle # noqa: F401 +from paddle.reader.decorator import xmap_readers # noqa: F401 +from paddle.reader.decorator import firstn # noqa: F401 +from paddle.reader.decorator import buffered # noqa: F401 +from paddle.reader.decorator import compose # noqa: F401 +from paddle.reader.decorator import cache # noqa: F401 +from paddle.reader.decorator import ComposeNotAligned # noqa: F401 +from paddle.reader.decorator import chain # noqa: F401 +from paddle.reader.decorator import multiprocess_reader # noqa: F401 __all__ = [] diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 8ee4d73ea847e..4e1c3827d3845 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
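Editor's note: the decorator helpers that python/paddle/reader/__init__.py now re-exports by name (see above) still compose in the usual way. A minimal sketch, assuming the long-standing reader-creator convention; `digits` is a hypothetical reader creator used only for illustration:

# Compose the re-exported decorators: shuffle within a buffer, then take the first six items.
import paddle.reader as reader

def digits():
    # A reader creator: a callable that returns a fresh generator each call.
    for i in range(10):
        yield i

pipeline = reader.firstn(reader.shuffle(digits, buf_size=5), 6)
print(list(pipeline()))  # six integers from 0-9, partially shuffled within buffers of five
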
-__all__ = [ - 'cache', 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'multiprocess_reader' -] - from threading import Thread import subprocess import multiprocessing diff --git a/python/paddle/static/nn/__init__.py b/python/paddle/static/nn/__init__.py index fd84a0a9284ee..0e9754d3c1fbf 100644 --- a/python/paddle/static/nn/__init__.py +++ b/python/paddle/static/nn/__init__.py @@ -39,6 +39,21 @@ 'switch_case', 'while_loop', 'sparse_embedding', + 'sequence_conv', + 'sequence_softmax', + 'sequence_pool', + 'sequence_concat', + 'sequence_first_step', + 'sequence_last_step', + 'sequence_slice', + 'sequence_expand', + 'sequence_expand_as', + 'sequence_pad', + 'sequence_unpad', + 'sequence_reshape', + 'sequence_scatter', + 'sequence_enumerate', + 'sequence_reverse', ] from .common import fc #DEFINE_ALIAS @@ -69,3 +84,19 @@ from ...fluid.input import embedding #DEFINE_ALIAS from ...fluid.contrib.layers import sparse_embedding #DEFINE_ALIAS + +from ...fluid.layers.sequence_lod import sequence_conv +from ...fluid.layers.sequence_lod import sequence_softmax +from ...fluid.layers.sequence_lod import sequence_pool +from ...fluid.layers.sequence_lod import sequence_concat +from ...fluid.layers.sequence_lod import sequence_first_step +from ...fluid.layers.sequence_lod import sequence_last_step +from ...fluid.layers.sequence_lod import sequence_slice +from ...fluid.layers.sequence_lod import sequence_expand +from ...fluid.layers.sequence_lod import sequence_expand_as +from ...fluid.layers.sequence_lod import sequence_pad +from ...fluid.layers.sequence_lod import sequence_unpad +from ...fluid.layers.sequence_lod import sequence_reshape +from ...fluid.layers.sequence_lod import sequence_scatter +from ...fluid.layers.sequence_lod import sequence_enumerate +from ...fluid.layers.sequence_lod import sequence_reverse diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 69ee296230383..4cf10f8a69c45 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1036,8 +1036,8 @@ def assign(x, output=None): The OP copies the :attr:`x` to the :attr:`output`. Parameters: - x (Tensor|numpy.ndarray): A tensor or numpy ndarray, its data type supports - float16, float32, float64, int32 and int64. + x (Tensor|numpy.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple, list or scalar, + its data type supports float16, float32, float64, int32, int64, and bool. output (Tensor, optional): A tensor. If :attr:`output` is None, a new tensor will be created as :attr:`output`. Default: None. @@ -1058,5 +1058,6 @@ def assign(x, output=None): result2 = paddle.assign(data) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] result3 = paddle.assign(np.array([[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]], dtype='float32')) # result3 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] """ - check_type(x, 'x', (Variable, numpy.ndarray), 'assign') + check_type(x, 'x', (Variable, numpy.ndarray, list, tuple, float, int, bool), + 'assign') return tensor.assign(x, output) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 99f5bf7ba0ad1..583290e431d63 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -39,8 +39,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): """ - Applies matrix multiplication to two tensors. `matmul` follows - the complete broadcast rules, + Applies matrix multiplication to two tensors. 
`matmul` follows + the complete broadcast rules, and its behavior is consistent with `np.matmul`. Currently, the input tensors' number of dimensions can be any, `matmul` can be used to @@ -50,8 +50,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically: - If a transpose flag is specified, the last two dimensions of the tensor - are transposed. If the tensor is ndim-1 of shape, the transpose is invalid. If the tensor - is ndim-1 of shape :math:`[D]`, then for :math:`x` it is treated as :math:`[1, D]`, whereas + are transposed. If the tensor is ndim-1 of shape, the transpose is invalid. If the tensor + is ndim-1 of shape :math:`[D]`, then for :math:`x` it is treated as :math:`[1, D]`, whereas for :math:`y` it is the opposite: It is treated as :math:`[D, 1]`. The multiplication behavior depends on the dimensions of `x` and `y`. Specifically: @@ -60,22 +60,22 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): - If both tensors are 2-dimensional, the matrix-matrix product is obtained. - - If the `x` is 1-dimensional and the `y` is 2-dimensional, - a `1` is prepended to its dimension in order to conduct the matrix multiply. + - If the `x` is 1-dimensional and the `y` is 2-dimensional, + a `1` is prepended to its dimension in order to conduct the matrix multiply. After the matrix multiply, the prepended dimension is removed. - - - If the `x` is 2-dimensional and `y` is 1-dimensional, + + - If the `x` is 2-dimensional and `y` is 1-dimensional, the matrix-vector product is obtained. - - If both arguments are at least 1-dimensional and at least one argument - is N-dimensional (where N > 2), then a batched matrix multiply is obtained. - If the first argument is 1-dimensional, a 1 is prepended to its dimension - in order to conduct the batched matrix multiply and removed after. - If the second argument is 1-dimensional, a 1 is appended to its - dimension for the purpose of the batched matrix multiple and removed after. - The non-matrix (exclude the last two dimensions) dimensions are - broadcasted according the broadcast rule. - For example, if input is a (j, 1, n, m) tensor and the other is a (k, m, p) tensor, + - If both arguments are at least 1-dimensional and at least one argument + is N-dimensional (where N > 2), then a batched matrix multiply is obtained. + If the first argument is 1-dimensional, a 1 is prepended to its dimension + in order to conduct the batched matrix multiply and removed after. + If the second argument is 1-dimensional, a 1 is appended to its + dimension for the purpose of the batched matrix multiple and removed after. + The non-matrix (exclude the last two dimensions) dimensions are + broadcasted according the broadcast rule. + For example, if input is a (j, 1, n, m) tensor and the other is a (k, m, p) tensor, out will be a (j, k, n, p) tensor. Args: @@ -177,11 +177,17 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): Returns the matrix norm (Frobenius) or vector norm (the 1-norm, the Euclidean or 2-norm, and in general the p-norm for p > 0) of a given tensor. + .. note:: + This norm API is different from `numpy.linalg.norm`. + This api supports high-order input tensors (rank >= 3), and certain axis need to be pointed out to calculate the norm. + But `numpy.linalg.norm` only supports 1-D vector or 2-D matrix as input tensor. + For p-order matrix norm, this api actually treats matrix as a flattened vector to calculate the vector norm, NOT REAL MATRIX NORM. 
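Editor's note: a quick illustration of the note just added (a sketch, not part of the docstring itself); the numeric value on the last line is the behavior the note describes, assuming the flattened-vector semantics hold:

# For a rank-3 input an axis must be given; a "matrix norm" over two axes is
# computed on the flattened slice rather than as a true matrix norm.
import paddle

x = paddle.ones([2, 3, 4])

print(paddle.norm(x, p=2, axis=-1).shape)          # [2, 3]: vector 2-norm along the last axis
print(paddle.norm(x, p='fro', axis=[1, 2]).shape)  # [2]: Frobenius norm per 3x4 slice
# Per the note above, p=1 over two axes should give 12.0 per slice (the 1-norm of
# the flattened 3x4 block of ones), not the max column sum (3.0) of a real matrix 1-norm.
print(paddle.norm(x, p=1, axis=[1, 2]).numpy())
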
+ Args: x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64. p (float|string, optional): Order of the norm. Supported values are `fro`, `0`, `1`, `2`, - `inf`, `-inf` and any positive real number yielding the corresponding p-norm. Not supported: ord < 0 and nuclear norm. + `inf`, `-inf` and any positive real number yielding the corresponding p-norm. Not supported: ord < 0 and nuclear norm. Default value is `fro`. axis (int|list|tuple, optional): The axis on which to apply norm operation. If axis is int or list(int)/tuple(int) with only one element, the vector norm is computed over the axis. @@ -198,10 +204,10 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): Returns: Tensor: results of norm operation on the specified axis of input tensor, it's data type is the same as input's Tensor. - + Examples: .. code-block:: python - + import paddle import numpy as np shape=[2, 3, 4] @@ -344,6 +350,10 @@ def inf_norm(input, return reduce_out def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): + """ + NOTE: + This function actually treats the matrix as flattened vector to calculate vector norm instead of matrix norm. + """ block = LayerHelper('norm', **locals()) out = block.create_variable_for_type_inference( dtype=block.input_dtype()) @@ -548,10 +558,10 @@ def dist(x, y, p=2): def dot(x, y, name=None): """ This operator calculates inner product for vectors. - + .. note:: - Support 1-d and 2-d Tensor. When it is 2d, the first dimension of this matrix - is the batch dimension, which means that the vectors of multiple batches are dotted. + Support 1-d and 2-d Tensor. When it is 2d, the first dimension of this matrix + is the batch dimension, which means that the vectors of multiple batches are dotted. Parameters: x(Tensor): 1-D or 2-D ``Tensor``. Its dtype should be ``float32``, ``float64``, ``int32``, ``int64`` @@ -604,17 +614,17 @@ def dot(x, y, name=None): def t(input, name=None): """ - Transpose <=2-D tensor. - 0-D and 1-D tensors are returned as it is and 2-D tensor is equal to + Transpose <=2-D tensor. + 0-D and 1-D tensors are returned as it is and 2-D tensor is equal to the paddle.transpose function which perm dimensions set 0 and 1. - + Args: input (Tensor): The input Tensor. It is a N-D (N<=2) Tensor of data types float16, float32, float64, int32. - name(str, optional): The default value is None. Normally there is no need for + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: Tensor: A transposed n-D Tensor, with data type being float16, float32, float64, int32, int64. - + For Example: .. code-block:: text @@ -679,10 +689,10 @@ def t(input, name=None): def cross(x, y, axis=None, name=None): """ Computes the cross product between two tensors along an axis. - + Inputs must have the same shape, and the length of their axes should be equal to 3. If `axis` is not given, it defaults to the first axis found with the length 3. - + Args: x (Tensor): The first input tensor. y (Tensor): The second input tensor. @@ -691,7 +701,7 @@ def cross(x, y, axis=None, name=None): Returns: Tensor. A Tensor with same data type as `x`. - + Examples: .. code-block:: python @@ -737,8 +747,8 @@ def cross(x, y, axis=None, name=None): def cholesky(x, upper=False, name=None): r""" Computes the Cholesky decomposition of one symmetric positive-definite - matrix or batches of symmetric positive-definite matrice. 
- + matrix or batches of symmetric positive-definite matrice. + If `upper` is `True`, the decomposition has the form :math:`A = U^{T}U` , and the returned matrix :math:`U` is upper-triangular. Otherwise, the decomposition has the form :math:`A = LL^{T}` , and the returned matrix @@ -755,7 +765,7 @@ def cholesky(x, upper=False, name=None): Returns: Tensor: A Tensor with same shape and data type as `x`. It represents \ triangular matrices generated by Cholesky decomposition. - + Examples: .. code-block:: python @@ -845,7 +855,7 @@ def bmm(x, y, name=None): def histogram(input, bins=100, min=0, max=0): """ - Computes the histogram of a tensor. The elements are sorted into equal width bins between min and max. + Computes the histogram of a tensor. The elements are sorted into equal width bins between min and max. If min and max are both zero, the minimum and maximum values of the data are used. Args: diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 696775434b967..669225d813641 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -16,7 +16,7 @@ from ..fluid.layers import core from ..fluid.layer_helper import LayerHelper -from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_, device_guard +from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_, device_guard, dygraph_only from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..fluid.layers.tensor import fill_constant from ..fluid.layers import utils @@ -76,6 +76,42 @@ def _print_warning_in_static_mode(api_name): format(api_name, api_name)) +@dygraph_only +def tolist(x): + """ + **Notes**: + **This API is ONLY available in Dygraph mode** + + This function translate the paddle.Tensor to python list. + + Args: + x(Tensor): ``x`` is the Tensor we want to translate to list + + Returns: + list: A list that contain the same value of current Tensor. + + Returns type: + list: dtype is same as current Tensor + + Examples: + .. code-block:: python + + import paddle + + t = paddle.to_tensor([0,1,2,3,4]) + expectlist = t.tolist() + print(expectlist) #[0, 1, 2, 3, 4] + + expectlist = paddle.tolist(t) + print(expectlist) #[0, 1, 2, 3, 4] + + """ + return x.numpy().tolist() + + +setattr(core.VarBase, 'tolist', tolist) + + def concat(x, axis=0, name=None): """ diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 87efa9ac442b6..215d467828a14 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1475,10 +1475,10 @@ def clip(x, min=None, max=None, name=None): Out = MIN(MAX(x, min), max) Args: - x (Tensor): An N-D Tensor with data type float32 or float64. - min (float32|Tensor): The lower bound with type ``float32`` or a ``Tensor`` + x (Tensor): An N-D Tensor with data type float32, float64, int32 or int64. + min (float|int|Tensor): The lower bound with type ``float`` , ``int`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. - max (float32|Tensor): The upper bound with type ``float32`` or a ``Tensor`` + max (float|int|Tensor): The upper bound with type ``float``, ``int`` or a ``Tensor`` with shape [1] and type ``int32``, ``float32``, ``float64``. name (str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please @@ -1503,16 +1503,24 @@ def clip(x, min=None, max=None, name=None): # [[4.5, 6.4] """ - fmin = float(np.finfo(np.float32).min) - fmax = float(np.finfo(np.float32).max) + x_dtype = str(x.dtype) + if x_dtype == 'paddle.int32': + min_ = np.iinfo(np.int32).min + max_ = np.iinfo(np.int32).max - 2**7 + elif x_dtype == 'paddle.int64': + min_ = np.iinfo(np.int64).min + max_ = np.iinfo(np.int64).max - 2**39 + else: + min_ = float(np.finfo(np.float32).min) + max_ = float(np.finfo(np.float32).max) if in_dygraph_mode(): if isinstance(min, Variable): min = min.numpy().item(0) if isinstance(max, Variable): max = max.numpy().item(0) - min = fmin if min is None else min - max = fmax if max is None else max + min = min_ if min is None else min + max = max_ if max is None else max return core.ops.clip(x, "min", min, "max", max) if min is not None: @@ -1526,10 +1534,10 @@ def clip(x, min=None, max=None, name=None): check_dtype(max.dtype, 'max', ['float32', 'float64', 'int32'], 'clip', '(When the type of max in clip is Variable.)') - check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'clip') + check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], 'clip') inputs = {'X': x} - attrs = {'min': fmin, 'max': fmax} + attrs = {'min': min_, 'max': max_} if isinstance(min, Variable): min.stop_gradient = True diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 778a391df605e..e5148d039c927 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -93,6 +93,10 @@ def set_printoptions(precision=None, def _to_sumary(var): edgeitems = DEFAULT_PRINT_OPTIONS.edgeitems + # Handle tensor of shape contains 0, like [0, 2], [3, 0, 3] + if np.prod(var.shape) == 0: + return np.array([]) + if len(var.shape) == 0: return var elif len(var.shape) == 1: diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt index 9a676b6b7396b..bb572973fdb36 100644 --- a/python/paddle/tests/CMakeLists.txt +++ b/python/paddle/tests/CMakeLists.txt @@ -49,3 +49,4 @@ set_tests_properties(test_vision_models PROPERTIES TIMEOUT 120) set_tests_properties(test_dataset_uci_housing PROPERTIES TIMEOUT 120) set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 300) set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600) +set_tests_properties(test_hapi_hub PROPERTIES TIMEOUT 300) diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/tests/hubconf.py similarity index 63% rename from python/paddle/fluid/contrib/utils/__init__.py rename to python/paddle/tests/hubconf.py index 1c1c2fb227091..4b4a853ef2cd9 100644 --- a/python/paddle/fluid/contrib/utils/__init__.py +++ b/python/paddle/tests/hubconf.py @@ -1,23 +1,24 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function -from . 
import lookup_table_utils -from .lookup_table_utils import * -from . import hdfs_utils -from .hdfs_utils import * +dependencies = ['paddle'] -__all__ = [] -__all__ += lookup_table_utils.__all__ -__all__ += hdfs_utils.__all__ +import paddle +from test_hapi_hub_model import MM as _MM + + +def MM(out_channels=8, pretrained=False): + '''This is a test demo for paddle hub + ''' + return _MM(out_channels) diff --git a/python/paddle/tests/test_hapi_hub.py b/python/paddle/tests/test_hapi_hub.py new file mode 100644 index 0000000000000..06000d6c83367 --- /dev/null +++ b/python/paddle/tests/test_hapi_hub.py @@ -0,0 +1,132 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest +import os + +import paddle +from paddle.hapi import hub + +import numpy as np + + +class TestHub(unittest.TestCase): + def setUp(self, ): + self.local_repo = os.path.dirname(os.path.abspath(__file__)) + self.github_repo = 'lyuwenyu/paddlehub_demo:main' + + def testLoad(self, ): + model = hub.load( + self.local_repo, model='MM', source='local', out_channels=8) + + data = paddle.rand((1, 3, 100, 100)) + out = model(data) + np.testing.assert_equal(out.shape, [1, 8, 50, 50]) + + model = hub.load( + self.github_repo, model='MM', source='github', force_reload=True) + + model = hub.load( + self.github_repo, + model='MM', + source='github', + force_reload=False, + pretrained=False) + + model = hub.load( + self.github_repo.split(':')[0], + model='MM', + source='github', + force_reload=False, + pretrained=False) + + model = hub.load( + self.github_repo, + model='MM', + source='github', + force_reload=False, + pretrained=True, + out_channels=8) + + data = paddle.ones((1, 3, 2, 2)) + out = model(data) + + gt = np.array([ + 1.53965068, 0., 0., 1.39455748, 0.72066200, 0.19773030, 2.09201908, + 0.37345418 + ]) + np.testing.assert_equal(out.shape, [1, 8, 1, 1]) + np.testing.assert_almost_equal( + out.numpy(), gt.reshape(1, 8, 1, 1), decimal=5) + + def testHelp(self, ): + docs1 = hub.help( + self.local_repo, + model='MM', + source='local', ) + + docs2 = hub.help( + self.github_repo, model='MM', source='github', force_reload=False) + + assert docs1 == docs2 == 'This is a test demo for paddle hub\n ', '' + + def testList(self, ): + models1 = hub.list( + self.local_repo, + source='local', + force_reload=False, ) + + models2 = hub.list( + self.github_repo, + source='github', + force_reload=False, ) + + assert models1 == models2 == ['MM'], '' + + def testExcept(self, ): + with self.assertRaises(ValueError): + _ = hub.help( + self.github_repo, + model='MM', + source='github-test', + force_reload=False) + + with self.assertRaises(ValueError): + _ = hub.load( + self.github_repo, + model='MM', + source='github-test', + force_reload=False) + + with self.assertRaises(ValueError): + _ = hub.list( + self.github_repo, source='github-test', force_reload=False) + + with self.assertRaises(ValueError): + _ = 
hub.load( + self.local_repo, model=123, source='local', force_reload=False) + + with self.assertRaises(RuntimeError): + _ = hub.load( + self.local_repo, + model='123', + source='local', + force_reload=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_hapi_hub_model.py b/python/paddle/tests/test_hapi_hub_model.py new file mode 100644 index 0000000000000..774c7f6f33a65 --- /dev/null +++ b/python/paddle/tests/test_hapi_hub_model.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class MM(nn.Layer): + def __init__(self, out_channels): + super(MM, self).__init__() + self.conv = nn.Conv2D(3, out_channels, 3, 2, 1) + + def forward(self, x): + out = self.conv(x) + out = F.relu(out) + + return out diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 30ff3f81ca7af..aa5a7ab533a28 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -612,12 +612,34 @@ def find_paddle_includes(use_cuda=False): if OS_NAME.startswith('darwin'): # NOTE(Aurelius84): Ensure to find std v1 headers correctly. - std_v1_includes = '/Library/Developer/CommandLineTools/usr/include/c++/v1/' - include_dirs.append(std_v1_includes) + std_v1_includes = find_clang_cpp_include() + if std_v1_includes is not None and os.path.exists(std_v1_includes): + include_dirs.append(std_v1_includes) return include_dirs +def find_clang_cpp_include(compiler='clang'): + std_v1_includes = None + try: + compiler_version = subprocess.check_output([compiler, "--version"]) + if six.PY3: + compiler_version = compiler_version.decode() + infos = compiler_version.split("\n") + for info in infos: + if "InstalledDir" in info: + v1_path = info.split(':')[-1].strip() + if v1_path and os.path.exists(v1_path): + std_v1_includes = os.path.join( + os.path.dirname(v1_path), 'include/c++/v1') + except Exception: + # Just raise warnings because the include dir is not required. + warnings.warn( + "Failed to search `include/c++/v1/` include dirs. Don't worry because it's not required." + ) + return std_v1_includes + + def find_cuda_libraries(): """ Use heuristic method to find cuda static lib path diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index b7d7d0b5adb54..dda8abeff21c0 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -117,7 +117,11 @@ def _get_unique_endpoints(trainer_endpoints): return unique_endpoints -def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): +def get_path_from_url(url, + root_dir, + md5sum=None, + check_exist=True, + decompress=True): """ Download from given url to root_dir. 
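Editor's note: a sketch of the new `decompress` switch added to get_path_from_url above; the URL and cache directory below are placeholders for illustration only, and a network connection is assumed:

# Keep the downloaded archive as-is instead of auto-extracting it.
from paddle.utils.download import get_path_from_url

ARCHIVE_URL = 'https://example.com/archive.tar'  # placeholder URL, not a real endpoint
path = get_path_from_url(ARCHIVE_URL, root_dir='./cache', decompress=False)
print(path)  # local path to the still-compressed archive
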
if file or directory specified by url is exists under root_dir, return the path directly, otherwise download @@ -152,7 +156,8 @@ def get_path_from_url(url, root_dir, md5sum=None, check_exist=True): time.sleep(1) if ParallelEnv().current_endpoint in unique_endpoints: - if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath): + if decompress and (tarfile.is_tarfile(fullpath) or + zipfile.is_zipfile(fullpath)): fullpath = _decompress(fullpath) return fullpath diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index 3b98680c89f25..b39009985e735 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -74,6 +74,34 @@ def _is_cuda_available(): return False +def _run_dygraph_single(use_cuda): + """ + Testing the simple network in dygraph mode using one CPU/GPU. + + Args: + use_cuda (bool): Whether running with CUDA. + """ + paddle.disable_static() + if use_cuda: + paddle.set_device('gpu') + else: + paddle.set_device('cpu') + weight_attr = paddle.ParamAttr( + name="weight", initializer=paddle.nn.initializer.Constant(value=0.5)) + bias_attr = paddle.ParamAttr( + name="bias", initializer=paddle.nn.initializer.Constant(value=1.0)) + linear = paddle.nn.Linear( + 2, 4, weight_attr=weight_attr, bias_attr=bias_attr) + input_np = _prepare_data(1) + input_tensor = paddle.to_tensor(input_np) + linear_out = linear(input_tensor) + out = paddle.tensor.sum(linear_out) + out.backward() + opt = paddle.optimizer.Adam( + learning_rate=0.001, parameters=linear.parameters()) + opt.step() + + def _run_static_single(use_cuda): """ Testing the simple network with executor running directly, using one CPU/GPU. @@ -152,7 +180,11 @@ def run_check(): print("Running verify PaddlePaddle program ... ") - use_cuda = _is_cuda_available() + if paddle.is_compiled_with_cuda(): + use_cuda = _is_cuda_available() + else: + use_cuda = False + if use_cuda: device_str = "GPU" device_list = paddle.static.cuda_places() @@ -162,6 +194,7 @@ def run_check(): device_count = len(device_list) _run_static_single(use_cuda) + _run_dygraph_single(use_cuda) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 7d3d5f525c2c7..921e78cace6b3 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -104,7 +104,7 @@ class Compose(object): for i in range(10): sample = flowers[i] - print(sample[0].shape, sample[1]) + print(sample[0].size, sample[1]) """ diff --git a/python/setup.py.in b/python/setup.py.in index 0d6a5c101ccc1..0e94d02cd6f9b 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -159,7 +159,8 @@ packages=['paddle', 'paddle.distributed.fleet.proto', 'paddle.distributed.fleet.utils', 'paddle.distributed.fleet.meta_parallel', - 'paddle.distributed.fleet.meta_parallel.mp_utils', + 'paddle.distributed.fleet.meta_parallel.pp_utils', + 'paddle.distributed.fleet.meta_parallel.parallel_layers', 'paddle.framework', 'paddle.jit', 'paddle.jit.dy2static', @@ -177,11 +178,9 @@ packages=['paddle', 'paddle.fluid.contrib', 'paddle.fluid.contrib.decoder', 'paddle.fluid.contrib.quantize', - 'paddle.fluid.contrib.reader', 'paddle.fluid.contrib.slim', 'paddle.fluid.contrib.slim.quantization', 'paddle.fluid.contrib.slim.quantization.imperative', - 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.contrib.mixed_precision', 
'paddle.fluid.contrib.mixed_precision.bf16', @@ -189,7 +188,6 @@ packages=['paddle', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details', 'paddle.fluid.incubate', - 'paddle.fluid.incubate.data_generator', 'paddle.fluid.incubate.fleet', 'paddle.fluid.incubate.checkpoint', 'paddle.fluid.incubate.fleet.base', @@ -395,15 +393,22 @@ if os.name == 'nt': elif sys.platform == 'darwin': ext_modules = [] -def find_files(pattern, root): +def find_files(pattern, root, recursive=False): for dirpath, _, files in os.walk(root): - for filename in fnmatch.filter(files, pattern): - yield os.path.join(dirpath, filename) + for filename in fnmatch.filter(files, pattern): + yield os.path.join(dirpath, filename) + if not recursive: + break headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension')) + # extension - list(find_files('*', '${BOOST_INCLUDE_DIR}/boost'))) # boost + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension/include')) + # extension + list(find_files('*', '${BOOST_INCLUDE_DIR}/boost', True)) + # boost + # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` + # to `extension/incude`, + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex64.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex128.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h']) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -443,35 +448,18 @@ class InstallHeaders(Command): ('install_headers', 'install_dir'), ('force', 'force')) - def copy_data_type_headers(self): - # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` - # to `extension/incude`, - data_type_headers = (['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex64.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex128.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h']) - - install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include") - if not os.path.exists(install_dir): - self.mkpath(install_dir) - for header in data_type_headers: - self.copy_file(header, install_dir) - def mkdir_and_copy_file(self, header): if 'pb.h' in header: install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header) elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) + if 'fluid' in install_dir: + install_dir = "paddle/extension/include/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) - patterns = ['eigen3/src/extern_eigen3', 'boost/src/extern_boost', - 'dlpack/src/extern_dlpack/include', - 'install/protobuf/include', - 'install/gflags/include', - 'install/glog/include', 'install/xxhash/include', - 'install/mkldnn/include', - 'threadpool/src/extern_threadpool'] + patterns = ['boost/src/extern_boost', 'install/mkldnn/include'] for pattern in patterns: install_dir = re.sub(pattern, '', install_dir) install_dir = os.path.join(self.install_dir, os.path.dirname(install_dir)) @@ -487,7 +475,6 @@ class InstallHeaders(Command): for header in hdrs: (out, _) = self.mkdir_and_copy_file(header) self.outfiles.append(out) - self.copy_data_type_headers() def get_inputs(self): return self.distribution.headers or [] @@ -519,6 +506,12 @@ else: with open("@PADDLE_BINARY_DIR@/python/paddle/README.rst", "r")as f: long_description = unicode(f.read(), 'UTF-8') +# strip *.so to reduce package size +if '${WITH_STRIP}' == 'ON': + command = 'find 
${PADDLE_BINARY_DIR}/python/paddle -name "*.so" | xargs -i strip {}' + if os.system(command) != 0: + raise Exception("strip *.so failed, command: %s" % command) + with redirect_stdout(): setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', diff --git a/tools/__pycache__/static_mode_white_list.cpython-37.pyc b/tools/__pycache__/static_mode_white_list.cpython-37.pyc new file mode 100644 index 0000000000000..b1e58ce7689c7 Binary files /dev/null and b/tools/__pycache__/static_mode_white_list.cpython-37.pyc differ diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 05466883e58d2..b1395c28878e3 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -84,6 +84,7 @@ function run_test_sampcd_processor() { CUR_PWD=$(pwd) cd ${PADDLE_ROOT}/tools python test_sampcd_processor.py + python test_print_signatures.py cd ${CUR_PWD} } diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index 5e87804179f07..813781b5e79ce 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -50,17 +50,20 @@ RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/re RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install setuptools -U && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install setuptools -U + LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install setuptools -U && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install setuptools -U RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install -r /root/requirements.txt && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install -r /root/requirements.txt && \ go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' + LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.9.0/bin/pip3 install pre-commit 'ipython==5.3.0' RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz 
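Editor's note: for reference, a standalone sketch of the find_files helper as reworked in python/setup.py.in above. os.walk yields the root directory first, so breaking after the first iteration keeps the scan non-recursive unless recursive=True is passed; the ./include path is only an illustration:

# Non-recursive by default: only the top-level directory is scanned.
import fnmatch
import os

def find_files(pattern, root, recursive=False):
    for dirpath, _, files in os.walk(root):
        for filename in fnmatch.filter(files, pattern):
            yield os.path.join(dirpath, filename)
        if not recursive:
            break

print(list(find_files('*.h', './include')))                  # headers directly under ./include
print(list(find_files('*.h', './include', recursive=True)))  # headers in the whole tree
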
diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 2cae7896d6483..9500acb2f977c 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -65,6 +65,12 @@ RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \ CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null && ldconfig +# Install Python3.9 +RUN wget -q https://www.python.org/ftp/python/3.9.0/Python-3.9.0.tgz && \ + tar -xzf Python-3.9.0.tgz && cd Python-3.9.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && ldconfig + ENV PATH=/usr/local/python3.7.0/include:${PATH} ENV PATH=/usr/local/python3.7.0/bin:${PATH} ENV LD_LIBRARY_PATH=/usr/local/python3.7.0/lib:${LD_LIBRARY_PATH} @@ -92,16 +98,28 @@ RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e44 WORKDIR /home/setuptools-40.6.2 RUN python setup.py build && python setup.py install WORKDIR /home +RUN python3.9 -m pip uninstall -y pip setuptools && \ + python3.8 -m pip uninstall -y pip setuptools && \ + python3.7 -m pip uninstall -y pip setuptools && \ + python3.6 -m pip uninstall -y pip setuptools +RUN wget https://files.pythonhosted.org/packages/a7/e0/30642b9c2df516506d40b563b0cbd080c49c6b3f11a70b4c7a670f13a78b/setuptools-50.3.2.zip && apt-get -y install unzip && unzip setuptools-50.3.2.zip +WORKDIR /home/setuptools-50.3.2 +RUN python3.9 setup.py build && python3.9 setup.py install && \ + python3.8 setup.py build && python3.8 setup.py install && \ + python3.7 setup.py build && python3.7 setup.py install && \ + python3.6 setup.py build && python3.6 setup.py install +WORKDIR /home RUN wget https://files.pythonhosted.org/packages/28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz && tar -zxvf pip-20.0.1.tar.gz WORKDIR pip-20.0.1 RUN python setup.py install && \ + python3.9 setup.py install && \ python3.8 setup.py install && \ python3.7 setup.py install && \ python3.6 setup.py install WORKDIR /home -RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-20.0.1.tar.gz && \ - rm -r Python-$version setuptools-40.6.2 pip-20.0.1 +RUN rm Python-$version.tgz setuptools-40.6.2.zip setuptools-50.3.2.zip pip-20.0.1.tar.gz && \ + rm -r Python-$version setuptools-40.6.2 setuptools-50.3.2 pip-20.0.1 # Install Go and glide WORKDIR /home @@ -147,6 +165,9 @@ RUN pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ pip3.8 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ pip3.8 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip3.8 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.9 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.9 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.9 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark @@ -156,6 +177,8 @@ RUN pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.9 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.9 --no-cache-dir install 'ipykernel==4.6.0' 
'jupyter==1.0.0' && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' @@ -163,17 +186,20 @@ RUN pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort && \ pip3.7 --no-cache-dir install pylint pytest astroid isort && \ pip3.8 --no-cache-dir install pylint pytest astroid isort && \ + pip3.9 --no-cache-dir install pylint pytest astroid isort && \ pip --no-cache-dir install pylint pytest astroid isort RUN pip3.6 --no-cache-dir install coverage && \ pip3.7 --no-cache-dir install coverage && \ pip3.8 --no-cache-dir install coverage && \ + pip3.9 --no-cache-dir install coverage && \ pip --no-cache-dir install coverage COPY ./python/requirements.txt /root/ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.7 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ + pip3.9 --no-cache-dir install -r /root/requirements.txt && \ pip --no-cache-dir install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use @@ -182,9 +208,11 @@ RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y && \ pip3.6 install --upgrade pip && \ pip3.7 install --upgrade pip && \ pip3.8 install --upgrade pip && \ + pip3.9 install --upgrade pip && \ pip3.6 --no-cache-dir install certifi urllib3[secure] && \ pip3.7 --no-cache-dir install certifi urllib3[secure] && \ pip3.8 --no-cache-dir install certifi urllib3[secure] && \ + pip3.9 --no-cache-dir install certifi urllib3[secure] && \ pip --no-cache-dir install certifi urllib3[secure] # ar mishandles 4GB files diff --git a/tools/dockerfile/Dockerfile.ubuntu18 b/tools/dockerfile/Dockerfile.ubuntu18 index d6c4753e74675..7dad70f00d476 100644 --- a/tools/dockerfile/Dockerfile.ubuntu18 +++ b/tools/dockerfile/Dockerfile.ubuntu18 @@ -36,27 +36,48 @@ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ ENV PATH=/usr/local/gcc-8.2/bin:$PATH + +# install cmake +WORKDIR /home +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH + + RUN apt-get update && \ apt-get install -y python2.7 python2.7-dev \ - python3.5 python3.5-dev \ python3.6 python3.6-dev \ python3.7 python3.7-dev \ - python3.8 python3.8-dev python3.8-distutils && \ - curl https://bootstrap.pypa.io/2.7/get-pip.py -o - | python2.7 && easy_install pip && \ - curl https://bootstrap.pypa.io/3.5/get-pip.py -o - | python3.5 && easy_install pip && \ - curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && easy_install pip && \ - curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.7 && easy_install pip && \ - curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.8 && easy_install pip && \ + python3.8 python3.8-dev python3.8-distutils \ + python3.9 python3.9-dev python3.9-distutils && \ rm /usr/bin/python && ln -s /usr/bin/python2.7 /usr/bin/python && \ - rm /usr/bin/python3 && ln -s /usr/bin/python3.5 /usr/bin/python3 && \ - rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \ - rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.5 /usr/local/bin/pip3 + rm /usr/bin/python3 && ln -s /usr/bin/python3.7 /usr/bin/python3 -# install cmake WORKDIR /home -RUN wget -q 
https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz -ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH +RUN wget https://files.pythonhosted.org/packages/a7/e0/30642b9c2df516506d40b563b0cbd080c49c6b3f11a70b4c7a670f13a78b/setuptools-50.3.2.zip && apt-get -y install unzip && unzip setuptools-50.3.2.zip +WORKDIR /home/setuptools-50.3.2 +RUN python3.9 setup.py build && python3.9 setup.py install && \ + python3.8 setup.py build && python3.8 setup.py install && \ + python3.7 setup.py build && python3.7 setup.py install && \ + python3.6 setup.py build && python3.6 setup.py install +WORKDIR /home +RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip && apt-get -y install unzip && unzip setuptools-40.6.2.zip +WORKDIR /home/setuptools-40.6.2 +RUN python setup.py build && python setup.py install +WORKDIR /home +RUN wget https://files.pythonhosted.org/packages/28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz && tar -zxvf pip-20.0.1.tar.gz +WORKDIR pip-20.0.1 +RUN python setup.py install && \ + python3.9 setup.py install && \ + python3.8 setup.py install && \ + python3.7 setup.py install && \ + python3.6 setup.py install + +WORKDIR /home +RUN rm setuptools-40.6.2.zip setuptools-50.3.2.zip pip-20.0.1.tar.gz && \ + rm -r setuptools-40.6.2 setuptools-50.3.2 pip-20.0.1 +RUN rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \ + rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.7 /usr/local/bin/pip3 # remove them when apt-get support 2.27 and higher version @@ -84,29 +105,29 @@ RUN git config --global credential.helper store # Fix locales to en_US.UTF-8 RUN localedef -i en_US -f UTF-8 en_US.UTF-8 -RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ - pip3 --no-cache-dir install ipykernel==4.6.0 wheel && \ - pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ +RUN pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.6 --no-cache-dir install ipykernel==4.6.0 wheel && \ pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.7 --no-cache-dir install ipykernel==4.6.0 wheel && \ pip3.8 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.9 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.9 --no-cache-dir install ipykernel==4.6.0 wheel && \ pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ pip --no-cache-dir install ipykernel==4.6.0 wheel #For docstring checker -RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ - pip3.6 --no-cache-dir install pylint pytest astroid isort && \ +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort && \ pip3.7 --no-cache-dir install pylint pytest astroid isort && \ pip3.8 --no-cache-dir install pylint pytest astroid isort && \ + pip3.9 --no-cache-dir install pylint pytest astroid isort && \ pip --no-cache-dir install pylint pytest astroid isort COPY ./python/requirements.txt /root/ -RUN pip3 --no-cache-dir install -r /root/requirements.txt && \ - pip3.6 --no-cache-dir install -r /root/requirements.txt && \ +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.7 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ + pip3.9 --no-cache-dir install -r 
/root/requirements.txt && \ pip --no-cache-dir install -r /root/requirements.txt diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 41f6e18f547cc..393bd045fb7f8 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -24,7 +24,7 @@ set -ex # remove others to expedite build and reduce docker image size. The original # manylinux docker image project builds many python versions. # NOTE We added back 3.5.1, since auditwheel requires python 3.3+ -CPYTHON_VERSIONS="3.8.0 3.7.0 3.6.0" +CPYTHON_VERSIONS="3.9.0 3.8.0 3.7.0 3.6.0" # openssl version to build, with expected sha256 hash of .tar.gz # archive @@ -80,11 +80,12 @@ build_cpythons $CPYTHON_VERSIONS PY36_BIN=/opt/python/cp36-cp36m/bin PY37_BIN=/opt/python/cp37-cp37m/bin PY38_BIN=/opt/python/cp38-cp38m/bin +PY39_BIN=/opt/python/cp39-cp39m/bin # NOTE Since our custom manylinux image builds pythons with shared # libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running # python. ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" -LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib" +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib:$(dirname ${PY39_BIN})/lib" # Our openssl doesn't know how to find the system CA trust store # (https://github.com/pypa/manylinux/issues/53) diff --git a/tools/dockerfile/build_scripts/build_utils.sh b/tools/dockerfile/build_scripts/build_utils.sh index d3098686594c0..bb560d0fdf227 100755 --- a/tools/dockerfile/build_scripts/build_utils.sh +++ b/tools/dockerfile/build_scripts/build_utils.sh @@ -102,6 +102,9 @@ function do_cpython_build { if [ -e ${prefix}/bin/python3.8 ]; then ln -s python3.8 ${prefix}/bin/python fi + if [ -e ${prefix}/bin/python3.9 ]; then + ln -s python3.9 ${prefix}/bin/python + fi # NOTE Make libpython shared library visible to python calls below LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/python get-pip.py LD_LIBRARY_PATH="/usr/local/ssl/lib:${prefix}/lib" ${prefix}/bin/pip install wheel==0.32.2 diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 5f8a48c8067a5..0c738de62eaaf 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -20,36 +20,36 @@ REPO="${REPO:-paddledocker}" function make_cuda9cudnn7(){ sed 's//9.0-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp - sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/gcc /usr/bin/gcc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/gcc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } function make_cuda10cudnn7() { sed 's//10.0-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp - sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash 
build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/gcc /usr/bin/gcc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/gcc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } function make_cuda101cudnn7() { sed 's//10.1-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp - sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/gcc /usr/bin/gcc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } function make_cuda102cudnn7() { sed 's//10.2-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp - sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/gcc /usr/bin/gcc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } function make_cuda102cudnn8() { sed 's//10.2-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp - sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/gcc /usr/bin/gcc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } function make_cuda11cudnn8() { sed 's//11.0-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp - sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/gcc /usr/bin/gcc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } function main() { diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 0f745f212078f..d2969618b85e8 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -425,7 +425,6 @@ 'cpu_info_test', 'cpu_helper_test', 'cow_ptr_tests', - 'convert_model2dot_ernie', 'conditional_block_op_test', 'cipher_utils_test', 'check_reduce_rank_test', @@ -483,6 +482,7 @@ # It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, # just remove it from this list. 
TWO_PARALLEL_JOB = [ + 'convert_model2dot_ernie', 'im2col_test', 'test_elementwise_add_grad_grad', 'test_logical_op', @@ -512,13 +512,10 @@ 'test_activation_nn_grad', 'test_pool2d_int8_mkldnn_op', 'test_adagrad_op_v2', - 'test_elementwise_add_op', 'test_nn_functional_hot_op', 'test_op_name_conflict', - 'test_softmax_with_cross_entropy_op', 'test_imperative_gan', 'test_simnet', - 'test_instance_norm_op', 'test_amp_check_finite_and_scale_op', 'test_random_seed', 'test_histogram_op', @@ -539,7 +536,6 @@ 'test_sigmoid_cross_entropy_with_logits_op', 'test_regularizer_api', 'test_lrn_op', - 'test_rank_attention_op', 'test_parallel_ssa_graph_inference_feed_partial_data', 'test_lod_reset_op', 'test_install_check', @@ -554,14 +550,12 @@ 'test_gather_tree_op', 'test_decoupled_py_reader', 'test_imperative_named_members', - 'test_conv3d_op', 'test_seqconv_eltadd_relu_fuse_pass', 'test_analysis_predictor', 'test_convert_operators', 'test_add_reader_dependency', 'test_is_tensor', 'test_variable', - 'test_unsqueeze_op', 'test_save_model_without_var', 'test_unfold_op', 'test_conv_bn_fuse_pass', @@ -618,7 +612,6 @@ 'test_adamax_op', 'test_while_loop_op', 'test_affine_grid_function', - 'test_trilinear_interp_op', 'test_transpose_flatten_concat_fuse_pass', 'test_trace_op', 'test_backward', @@ -813,7 +806,6 @@ 'test_sequence_expand_as', 'test_sequence_reverse', 'test_shape_op', - 'test_lod_tensor', 'test_diag', 'test_strided_slice_op', 'test_switch_case', @@ -831,7 +823,6 @@ 'test_arange', 'test_lrn_mkldnn_op', 'test_imperative_gnn', - 'test_eager_deletion_while_op', 'test_dequantize_abs_max_op', 'test_elementwise_mul_op', 'test_tensor_scalar_type_promotion_dynamic', @@ -873,7 +864,6 @@ 'test_manual_seed', 'test_buffer_shared_memory_reuse_pass', 'test_range', - 'test_activation_op', 'test_box_decoder_and_assign_op', 'test_imperative_optimizer_v2', 'test_python_operator_overriding', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index a18774a8b57b6..cfe34fa342656 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -25,21 +25,13 @@ import sys import pydoc import hashlib -import six +import platform import functools -import logging member_dict = collections.OrderedDict() visited_modules = set() -# APIs that should not be printed into API.spec -omitted_list = [ - "paddle.fluid.LoDTensor.set", # Do not know why it should be omitted - "paddle.fluid.io.ComposeNotAligned", - "paddle.fluid.io.ComposeNotAligned.__init__", -] - def md5(doc): hash = hashlib.md5() @@ -74,13 +66,28 @@ def format_spec(spec): def queue_dict(member, cur_name): - if cur_name in omitted_list: - return - - doc_md5 = md5(member.__doc__) - - if inspect.isclass(member): + if cur_name != 'paddle': + try: + eval(cur_name) + except (AttributeError, NameError, SyntaxError) as e: + print( + "Error({}) occurred when `eval({})`, discard it.".format( + str(e), cur_name), + file=sys.stderr) + return + + if (inspect.isclass(member) or inspect.isfunction(member) or + inspect.ismethod(member)) and hasattr( + member, '__module__') and hasattr(member, '__name__'): args = member.__module__ + "." 
+ member.__name__ + try: + eval(args) + except (AttributeError, NameError, SyntaxError) as e: + print( + "Error({}) occurred when `eval({})`, discard it for {}.".format( + str(e), args, cur_name), + file=sys.stderr) + return else: try: args = inspect.getargspec(member) @@ -95,6 +102,7 @@ def queue_dict(member, cur_name): if not has_type_error: args = format_spec(args) + doc_md5 = md5(member.__doc__) member_dict[cur_name] = "({}, ('document', '{}'))".format(args, doc_md5) @@ -106,8 +114,7 @@ def visit_member(parent_name, member, member_name=None): if inspect.isclass(member): queue_dict(member, cur_name) for name, value in inspect.getmembers(member): - if hasattr(value, '__name__') and (not name.startswith("_") or - name == "__init__"): + if hasattr(value, '__name__') and not name.startswith("_"): visit_member(cur_name, value) elif inspect.ismethoddescriptor(member): return @@ -123,7 +130,7 @@ def visit_member(parent_name, member, member_name=None): def is_primitive(instance): - int_types = (int, long) if six.PY2 else (int, ) + int_types = (int, long) if platform.python_version()[0] == "2" else (int, ) pritimitive_types = int_types + (float, str) if isinstance(instance, pritimitive_types): return True @@ -149,11 +156,14 @@ def visit_all_module(mod): return visited_modules.add(mod) - - for member_name in ( - name - for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) - if not name.startswith("_")): + if hasattr(mod, "__all__"): + member_names = (name for name in mod.__all__ + if not name.startswith("_")) + elif mod_name == 'paddle': + member_names = dir(mod) + else: + return + for member_name in member_names: instance = getattr(mod, member_name, None) if instance is None: continue @@ -168,17 +178,20 @@ def visit_all_module(mod): visit_all_module(instance) else: if member_name != instance.__name__: - logging.warn( + print( "Found alias API, alias name is: {}, original name is: {}". 
- format(member_name, instance.__name__)) + format(member_name, instance.__name__), + file=sys.stderr) visit_member(mod.__name__, instance, member_name) else: visit_member(mod.__name__, instance) -modules = sys.argv[1].split(",") -for m in modules: - visit_all_module(importlib.import_module(m)) +if __name__ == '__main__': + import paddle + modules = sys.argv[1].split(",") + for m in modules: + visit_all_module(importlib.import_module(m)) -for name in member_dict: - print(name, member_dict[name]) + for name in member_dict: + print(name, member_dict[name]) diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index a8a717da027e0..52777cd59ba25 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -19,8 +19,6 @@ import math import platform import inspect -import paddle -import paddle.fluid import json import argparse import shutil @@ -28,8 +26,8 @@ import logging """ please make sure to run in the tools path -usage: python sample_test.py {arg1} -arg1: the first arg defined running in gpu version or cpu version +usage: python sample_test.py {cpu or gpu} + {cpu or gpu}: running in cpu version or gpu version for example, you can run cpu version python2 testing like this: @@ -44,9 +42,7 @@ else: console = logging.StreamHandler() logger.addHandler(console) -console.setFormatter( - logging.Formatter( - "%(asctime)s - %(funcName)s:%(lineno)d - %(levelname)s - %(message)s")) +console.setFormatter(logging.Formatter("%(message)s")) RUN_ON_DEVICE = 'cpu' GPU_ID = 0 @@ -107,11 +103,9 @@ def check_indent(cdline): return indent -# srccom: raw comments in the source,including ''' and original indent -def sampcd_extract_and_run(srccom, name, htype="def", hname=""): +def sampcd_extract_to_file(srccom, name, htype="def", hname=""): """ - Extract and run sample codes from source comment and - the result will be returned. + Extract sample codes from __doc__, and write them to files. Args: srccom(str): the source comment of some API whose @@ -121,35 +115,12 @@ def sampcd_extract_and_run(srccom, name, htype="def", hname=""): hname(str): the name of the hint banners , e.t. def hname. Returns: - result: True or False - name(str): the name of the API. - msg(str): messages + sample_code_filenames(list of str) """ global GPU_ID, RUN_ON_DEVICE, SAMPLECODE_TEMPDIR + CODE_BLOCK_INTERDUCTORY = "code-block:: python" - result = True - msg = None - - def sampcd_header_print(name, sampcd, htype, hname): - """ - print hint banner headers. - - Args: - name(str): the name of the API. - sampcd(str): sample code string - htype(str): the type of hint banners, def/class/method. - hname(str): the name of the hint banners , e.t. def hname. - flushed. - """ - print(htype, " name:", hname) - print("-----------------------") - print("Sample code ", str(y), " extracted for ", name, " :") - print(sampcd) - print("----example code check----\n") - print("executing sample code .....") - print("execution result:") - - sampcd_begins = find_all(srccom, " code-block:: python") + sampcd_begins = find_all(srccom, CODE_BLOCK_INTERDUCTORY) if len(sampcd_begins) == 0: # detect sample codes using >>> to format and consider this situation as wrong print(htype, " name:", hname) @@ -161,14 +132,14 @@ def sampcd_header_print(name, sampcd, htype, hname): "Deprecated sample code style:\n\n Examples:\n\n >>>codeline\n >>>codeline\n\n\n ", "Please use '.. 
code-block:: python' to ", "format sample code.\n") - result = False + return [] else: print("Error: No sample code!\n") - result = False - + return [] + sample_code_filenames = [] for y in range(1, len(sampcd_begins) + 1): sampcd_begin = sampcd_begins[y - 1] - sampcd = srccom[sampcd_begin + len(" code-block:: python") + 1:] + sampcd = srccom[sampcd_begin + len(CODE_BLOCK_INTERDUCTORY) + 1:] sampcd = sampcd.split("\n") # remove starting empty lines while sampcd[0].replace(' ', '').replace('\t', '') == '': @@ -200,361 +171,90 @@ def sampcd_header_print(name, sampcd, htype, hname): tfname = os.path.join(SAMPLECODE_TEMPDIR, '{}_example{}'.format( name, '.py' if len(sampcd_begins) == 1 else '_{}.py'.format(y))) - logging.info('running %s', tfname) with open(tfname, 'w') as tempf: tempf.write(sampcd) - if platform.python_version()[0] == "2": - cmd = ["python", tfname] - elif platform.python_version()[0] == "3": - cmd = ["python3", tfname] - else: - print("Error: fail to parse python version!") - result = False - exit(1) - - subprc = subprocess.Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output, error = subprc.communicate() - msg = "".join(output.decode(encoding='utf-8')) - err = "".join(error.decode(encoding='utf-8')) + sample_code_filenames.append(tfname) + return sample_code_filenames - if subprc.returncode != 0: - print("\nSample code error found in ", name, ":\n") - sampcd_header_print(name, sampcd, htype, hname) - print("subprocess return code: ", str(subprc.returncode)) - print("Error Raised from Sample Code ", name, " :\n") - print(err) - print(msg) - logging.warning('%s error: %s', tfname, err) - logging.warning('%s msg: %s', tfname, msg) - result = False - # msg is the returned code execution report - return result, name, msg - - -def single_defcom_extract(start_from, srcls, is_class_begin=False): +def execute_samplecode(tfname): """ - to extract a def function/class/method comments body + Execute a sample-code test. Args: - start_from(int): the line num of "def" header - srcls(list): the source file in lines - is_class_begin(bool): whether the start_from is a beginning a class. \ - For a sole class body itself may end up with its method if it has no - docstring. But the body of \ - a common def function can only be ended up by a none-indented def/class - + tfname: the filename of the samplecode. + Returns: - string : the extracted comment body, inclusive of its quote marks. 
- - """ - - i = start_from - fcombody = "" # def comment body - comstart = -1 # the starting line index of comment mark "'''" or """""" - # if it is not -1, it indicates the loop is in the comment body - comstyle = 0 # comment mark style ,comments quoted with ''' is coded as 1 - # comments quoted with """ is coded as 2 - for x in range(i + 1, len(srcls)): - if is_class_begin: - if srcls[x].replace('\t', ' ').startswith(' def '): - break - if srcls[x].startswith('def ') or srcls[x].startswith('class '): - break - else: - if comstart == -1: - s = srcls[x].replace(" ", '').replace("\t", - '').replace("\n", '') - if s.startswith("\"\"\"") or s.startswith("r\"\"\""): - comstart = x - comstyle = 2 - continue - if (comstyle == 2 and comstart != -1 and - srcls[x].replace(" ", '').replace("\t", '').replace( - "\n", '').startswith("\"\"\"")): - break - if comstart == -1: - s = srcls[x].replace(" ", '').replace("\t", - '').replace("\n", '') - if s.startswith("\'\'\'") or s.startswith("r\'\'\'"): - comstart = x - comstyle = 1 - continue - if (comstyle == 1 and comstart != -1 and - srcls[x].replace(" ", '').replace("\t", '').replace( - "\n", '').startswith("\'\'\'")): - break - if (comstart != - -1): # when the comments start, begin to add line to fcombody - fcombody += srcls[x] - return fcombody - - -def srccoms_extract(srcfile, wlist, methods): + result: success or not + tfname: same as the input argument + msg: the stdout output of the samplecode executing. """ - Given a source file ``srcfile``, this function will - extract its API(doc comments) and run sample codes in the - API. - - Args: - srcfile(file): the source file - wlist(list): white list - methods(list): only elements of this list considered. - - Returns: - result: True or False - error_methods: the methods that failed. - """ - - process_result = True - error_methods = [] - srcc = srcfile.read() - # 2. get defs and classes header line number - # set file pointer to its beginning - srcfile.seek(0, 0) - srcls = srcfile.readlines() # source lines - - # 1. fetch__all__ list - allidx = srcc.find("__all__") - logger.debug('processing %s, methods: %s', srcfile.name, str(methods)) - srcfile_new, _ = os.path.splitext(srcfile.name) - srcfile_list = srcfile_new.split('/') - srcfile_str = '' - for i in range(4, len(srcfile_list)): - srcfile_str = srcfile_str + srcfile_list[i] + '.' 
- if allidx != -1: - alllist = [] - # get all list for layers/ops.py - if srcfile.name.find("ops.py") != -1: - for ai in range(0, len(srcls)): - if srcls[ai].startswith("__all__"): - lb = srcls[ai].find('[') - rb = srcls[ai].find(']') - if lb == -1: - continue - allele = srcls[ai][lb + 1:rb].replace("'", '').replace( - " ", '').replace("\"", '') - alllist.append(allele) - if '' in alllist: - alllist.remove('') - else: - alllist_b = allidx + len("__all__") - allstr = srcc[alllist_b + srcc[alllist_b:].find("[") + 1:alllist_b + - srcc[alllist_b:].find("]")] - allstr = allstr.replace("\n", '').replace(" ", '').replace( - "'", '').replace("\"", '') - alllist = allstr.split(',') - if '' in alllist: - alllist.remove('') - api_alllist_count = len(alllist) - logger.debug('found %d items: %s', api_alllist_count, str(alllist)) - api_count = 0 - handled = [] - # get src contents in layers/ops.py - if srcfile.name.find("ops.py") != -1: - for i in range(0, len(srcls)): - opname = None - opres = re.match(r"^(\w+)\.__doc__", srcls[i]) - if opres is not None: - opname = opres.group(1) - else: - opres = re.match( - r"^add_sample_code\(globals\(\)\[\"(\w+)\"\]", srcls[i]) - if opres is not None: - opname = opres.group(1) - if opname is not None: - if opname in wlist: - logger.info('%s is in the whitelist, skip it.', opname) - continue - else: - logger.debug('%s\'s docstring found.', opname) - comstart = i - for j in range(i, len(srcls)): - if srcls[j].find("\"\"\"") != -1: - comstart = i - opcom = "" - for j in range(comstart + 1, len(srcls)): - opcom += srcls[j] - if srcls[j].find("\"\"\"") != -1: - break - result, _, _ = sampcd_extract_and_run(opcom, opname, "def", - opname) - if not result: - error_methods.append(opname) - process_result = False - api_count += 1 - handled.append( - opname) # ops.py also has normal formatted functions - # use list 'handled' to mark the functions have been handled here - # which will be ignored in the following step - # handled what? 
- logger.debug('%s already handled.', str(handled)) - for i in range(0, len(srcls)): - if srcls[i].startswith( - 'def '): # a function header is detected in line i - f_header = srcls[i].replace(" ", '') - fn = f_header[len('def'):f_header.find('(')] # function name - if "%s%s" % (srcfile_str, fn) not in methods: - logger.info( - '[file:%s, function:%s] not in methods list, skip it.', - srcfile_str, fn) - continue - if fn in handled: - continue - if fn in alllist: - api_count += 1 - if fn in wlist or fn + "@" + srcfile.name in wlist: - logger.info('[file:%s, function:%s] skip by wlist.', - srcfile_str, fn) - continue - fcombody = single_defcom_extract(i, srcls) - if fcombody == "": # if no comment - print("def name:", fn) - print("-----------------------") - print("WARNING: no comments in function ", fn, - ", but it deserves.") - continue - else: - result, _, _ = sampcd_extract_and_run(fcombody, fn, - "def", fn) - if not result: - error_methods.append(fn) - process_result = False - - if srcls[i].startswith('class '): - c_header = srcls[i].replace(" ", '') - cn = c_header[len('class'):c_header.find('(')] # class name - if '%s%s' % (srcfile_str, cn) not in methods: - logger.info( - '[file:%s, class:%s] not in methods list, skip it.', - srcfile_str, cn) - continue - if cn in handled: - continue - if cn in alllist: - api_count += 1 - if cn in wlist or cn + "@" + srcfile.name in wlist: - logger.info('[file:%s, class:%s] skip by wlist.', - srcfile_str, cn) - continue - # class comment - classcom = single_defcom_extract(i, srcls, True) - if classcom != "": - result, _, _ = sampcd_extract_and_run(classcom, cn, - "class", cn) - if not result: - error_methods.append(cn) - process_result = False - else: - print("WARNING: no comments in class itself ", cn, - ", but it deserves.\n") - # handling methods in class bodies - for x in range( - i + 1, - len(srcls)): # from the next line of class header - if (srcls[x].startswith('def ') or - srcls[x].startswith('class ')): - break - else: - # member method def header - srcls[x] = srcls[x].replace('\t', ' ') - if (srcls[x].startswith( - ' def ')): # detect a mehtod header.. - thisl = srcls[x] - indent = len(thisl) - len(thisl.lstrip()) - mn = thisl[indent + len('def '):thisl.find( - '(')] # method name - name = cn + "." 
+ mn # full name - if '%s%s' % ( - srcfile_str, name - ) not in methods: # class method not in api.spec - logger.info( - '[file:%s, func:%s] not in methods, skip it.', - srcfile_str, name) - continue - if mn.startswith('_'): - logger.info( - '[file:%s, func:%s] startswith _, it\'s private method, skip it.', - srcfile_str, name) - continue - if name in wlist or name + "@" + srcfile.name in wlist: - logger.info( - '[file:%s, class:%s] skip by wlist.', - srcfile_str, name) - continue - thismethod = [thisl[indent:] - ] # method body lines - # get all the lines of a single method body - # into thismethod(list) - # and send it to single_defcom_extract - for y in range(x + 1, len(srcls)): - srcls[y] = srcls[y].replace('\t', ' ') - if (srcls[y].startswith('def ') or - srcls[y].startswith('class ')): - # end of method - break - elif srcls[y].startswith(' def '): - # end of method - break - else: - thismethod.append(srcls[y][indent:]) - thismtdcom = single_defcom_extract(0, - thismethod) - if thismtdcom != "": - result, _, _ = sampcd_extract_and_run( - thismtdcom, name, "method", name) - if not result: - error_methods.append(name) - process_result = False + result = True + msg = None + if platform.python_version()[0] in ["2", "3"]: + cmd = [sys.executable, tfname] else: - logger.warning('__all__ not found in file:%s', srcfile.name) - - return process_result, error_methods - + print("Error: fail to parse python version!") + result = False + exit(1) -def test(file_list): - global methods # readonly - process_result = True - for file in file_list: - with open(file, 'r') as src: - if not srccoms_extract(src, wlist, methods): - process_result = False - return process_result + # check required envisonment + with open(tfname, 'r') as f: + for line in f.readlines(): + if re.match(r'#\s*required\s*:\s*(distributed|gpu|skip)', line): + result = True + return result, tfname, '{} is skipped. cause: {}'.format(tfname, + line) + + logging.info('running %s', tfname) + print("\n----example code check----") + print("executing sample code .....", tfname) + subprc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, error = subprc.communicate() + msg = "".join(output.decode(encoding='utf-8')) + err = "".join(error.decode(encoding='utf-8')) + + if subprc.returncode != 0: + print("Sample code error found in ", tfname, ":") + print("-----------------------") + print(open(tfname).read()) + print("-----------------------") + print("subprocess return code: ", str(subprc.returncode)) + print("Error Raised from Sample Code ", tfname, " :") + print(err) + print(msg) + print("----example code check failed----\n") + logging.warning('%s error: %s', tfname, err) + logging.warning('%s msg: %s', tfname, msg) + result = False + else: + print("----example code check success----\n") - -def run_a_test(tc_filename): - """ - execute a sample code-block. - """ - global methods # readonly - process_result = True - with open(tc_filename, 'r') as src: - process_result, error_methods = srccoms_extract(src, wlist, methods) - return process_result, tc_filename, error_methods + # msg is the returned code execution report + return result, tfname, msg def get_filenames(): ''' - this function will get the modules that pending for check. + this function will get the sample code files that pending for check. Returns: - list: the modules pending for check . + dict: the sample code files pending for check . 
''' - filenames = [] global methods # write global whl_error - methods = [] + import paddle whl_error = [] get_incrementapi() - API_spec = API_DIFF_SPEC_FN - with open(API_spec) as f: + all_sample_code_filenames = {} + with open(API_DIFF_SPEC_FN) as f: for line in f.readlines(): api = line.replace('\n', '') try: - module = eval(api).__module__ + api_obj = eval(api) except AttributeError: whl_error.append(api) continue @@ -562,50 +262,24 @@ def get_filenames(): logger.warning('line:%s, api:%s', line, api) # paddle.Tensor. continue - if len(module.split('.')) > 1: - filename = '../python/' - # work for .so? - module_py = '%s.py' % module.split('.')[-1] - for i in range(0, len(module.split('.')) - 1): - filename = filename + '%s/' % module.split('.')[i] - filename = filename + module_py - else: - filename = '' - logger.warning("WARNING: Exception in getting api:%s module:%s", - api, module) - if filename in filenames: - continue - elif not filename: - logger.warning('filename invalid: %s', line) - continue - elif not os.path.exists(filename): - logger.warning('file not exists: %s', filename) - continue - else: - filenames.append(filename) - # get all methods - method = '' - if inspect.isclass(eval(api)): - name = api.split('.')[-1] - elif inspect.isfunction(eval(api)): - name = api.split('.')[-1] - elif inspect.ismethod(eval(api)): - name = '%s.%s' % (api.split('.')[-2], api.split('.')[-1]) - else: - name = '' - logger.warning( - "WARNING: Exception when getting api:%s, line:%s", api, - line) - for j in range(2, len(module.split('.'))): - method = method + '%s.' % module.split('.')[j] - method = method + name - if method not in methods: - methods.append(method) - os.remove(API_spec) - return filenames + if hasattr(api_obj, '__doc__') and api_obj.__doc__: + sample_code_filenames = sampcd_extract_to_file(api_obj.__doc__, + api) + for tfname in sample_code_filenames: + all_sample_code_filenames[tfname] = api + return all_sample_code_filenames def get_api_md5(path): + """ + read the api spec file, and scratch the md5sum value of every api's docstring. + + Args: + path: the api spec file. ATTENTION the path relative + + Returns: + api_md5(dict): key is the api's real fullname, value is the md5sum. + """ api_md5 = {} API_spec = '%s/%s' % (os.path.abspath(os.path.join(os.getcwd(), "..")), path) @@ -744,22 +418,13 @@ def parse_args(): if len(filenames) == 0 and len(whl_error) == 0: logger.info("-----API_PR.spec is the same as API_DEV.spec-----") exit(0) - rm_file = [] - for f in filenames: - for w_file in wlist_file: - if f.startswith(w_file): - rm_file.append(f) - filenames.remove(f) - if len(rm_file) != 0: - logger.info("REMOVE white files: %s", rm_file) logger.info("API_PR is diff from API_DEV: %s", filenames) threads = multiprocessing.cpu_count() if args.threads: threads = args.threads po = multiprocessing.Pool(threads) - # results = po.map_async(test, divided_file_list) - results = po.map_async(run_a_test, filenames) + results = po.map_async(execute_samplecode, filenames.keys()) po.close() po.join() diff --git a/tools/test_print_signatures.py b/tools/test_print_signatures.py new file mode 100644 index 0000000000000..7cbdbb56cb1b1 --- /dev/null +++ b/tools/test_print_signatures.py @@ -0,0 +1,95 @@ +#! /usr/bin/env python + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TestCases for print_signatures.py + +sample lines from API_DEV.spec: + paddle.autograd.backward (ArgSpec(args=['tensors', 'grad_tensors', 'retain_graph'], varargs=None, keywords=None, defaults=(None, False)), ('document', '33a4434c9d123331499334fbe0274870')) + paddle.autograd.PyLayer (paddle.autograd.py_layer.PyLayer, ('document', 'c26adbbf5f1eb43d16d4a399242c979e')) + paddle.autograd.PyLayer.apply (ArgSpec(args=['cls'], varargs=args, keywords=kwargs, defaults=None), ('document', 'cb78696dc032fb8af2cba8504153154d')) +""" +import unittest +import hashlib +import inspect +import functools +from print_signatures import md5 +from print_signatures import get_functools_partial_spec +from print_signatures import format_spec +from print_signatures import queue_dict +from print_signatures import member_dict + + +def func_example(param_a, param_b): + """ + example function + """ + pass + + +def func_example_2(func=functools.partial(func_example, 1)): + """ + example function 2 + """ + pass + + +class ClassExample(): + """ + example Class + """ + + def example_method(self): + """ + class method + """ + pass + + +class Test_all_in_print_signatures(unittest.TestCase): + def test_md5(self): + algo = hashlib.md5() + algo.update(func_example.__doc__.encode('utf-8')) + digest = algo.hexdigest() + self.assertEqual(digest, md5(func_example.__doc__)) + + def test_get_functools_partial_spec(self): + partailed_func = functools.partial(func_example, 1) + # args = inspect.getargspec(partailed_func) + self.assertEqual('func_example(args=(1,), keywords={})', + get_functools_partial_spec(partailed_func)) + + +class Test_format_spec(unittest.TestCase): + def test_normal_func_spec(self): + args = inspect.getargspec(func_example) + self.assertEqual( + '''ArgSpec(args=['param_a', 'param_b'], varargs=None, keywords=None, defaults=None)''', + format_spec(args)) + + def test_func_spec_with_partialedfunc_as_param_default(self): + # but there is no function belongs to this type in API_DEV.spec + args = inspect.getargspec(func_example_2) + self.assertEqual( + '''ArgSpec(args=['func'], varargs=None, keywords=None, defaults=('func_example(args=(1,), keywords={})',))''', + format_spec(args)) + + +class Test_queue_dict(unittest.TestCase): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py index f8bcb662e879b..7836728247f50 100644 --- a/tools/test_sampcd_processor.py +++ b/tools/test_sampcd_processor.py @@ -22,12 +22,13 @@ import importlib from sampcd_processor import find_all from sampcd_processor import check_indent -from sampcd_processor import sampcd_extract_and_run -from sampcd_processor import single_defcom_extract -from sampcd_processor import srccoms_extract from sampcd_processor import get_api_md5 from sampcd_processor import get_incrementapi from sampcd_processor import get_wlist +from sampcd_processor import sampcd_extract_to_file +from sampcd_processor import execute_samplecode + +SAMPLECODE_TEMP_DIR = 'samplecode_temp' class Test_find_all(unittest.TestCase): @@ -53,107 +54,95 @@ def test_indent_1_tab(self): 
self.assertEqual(4, check_indent("\thello paddle")) -class Test_sampcd_extract_and_run(unittest.TestCase): +class Test_execute_samplecode(unittest.TestCase): + def setUp(self): + if not os.path.exists(SAMPLECODE_TEMP_DIR): + os.mkdir(SAMPLECODE_TEMP_DIR) + self.successSampleCodeFile = os.path.join(SAMPLECODE_TEMP_DIR, + 'samplecode_success.py') + with open(self.successSampleCodeFile, 'w') as f: + f.write('print(1+1)') + self.failedSampleCodeFile = os.path.join(SAMPLECODE_TEMP_DIR, + 'samplecode_failed.py') + with open(self.failedSampleCodeFile, 'w') as f: + f.write('print(1/0)') + + def tearDown(self): + os.remove(self.successSampleCodeFile) + os.remove(self.failedSampleCodeFile) + + def test_run_success(self): + result, tfname, msg = execute_samplecode(self.successSampleCodeFile) + self.assertTrue(result) + self.assertEqual(self.successSampleCodeFile, tfname) + self.assertIsNotNone(msg) + self.assertLess(msg.find('skipped'), 0) + + def test_run_failed(self): + result, tfname, msg = execute_samplecode(self.failedSampleCodeFile) + self.assertFalse(result) + self.assertEqual(self.failedSampleCodeFile, tfname) + self.assertIsNotNone(msg) + self.assertLess(msg.find('skipped'), 0) + + def test_testcases_skipped(self): + ... + tfname = os.path.join(SAMPLECODE_TEMP_DIR, 'samplecode_skipped.py') + with open(tfname, 'w') as f: + f.write("# required: distributed\nprint(1/0)") + result, _, msg = execute_samplecode(tfname) + self.assertTrue(result) + self.assertGreaterEqual(msg.find('skipped'), 0) + os.remove(tfname) + + +class Test_sampcd_extract_to_file(unittest.TestCase): def setUp(self): - if not os.path.exists('samplecode_temp/'): - os.mkdir('samplecode_temp/') + if not os.path.exists(SAMPLECODE_TEMP_DIR): + os.mkdir(SAMPLECODE_TEMP_DIR) - def test_run_a_defs_samplecode(self): + def tearDown(self): + shutil.rmtree(SAMPLECODE_TEMP_DIR) + + def test_1_samplecode(self): comments = """ Examples: .. code-block:: python + print(1+1) """ funcname = 'one_plus_one' - res, name, msg = sampcd_extract_and_run(comments, funcname) - self.assertTrue(res) - self.assertEqual(funcname, name) + sample_code_filenames = sampcd_extract_to_file(comments, funcname) + self.assertCountEqual( + [os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example.py')], + sample_code_filenames) - def test_run_a_def_no_code(self): + def test_no_samplecode(self): comments = """ placeholder """ funcname = 'one_plus_one' - res, name, msg = sampcd_extract_and_run(comments, funcname) - self.assertFalse(res) - self.assertEqual(funcname, name) + sample_code_filenames = sampcd_extract_to_file(comments, funcname) + self.assertCountEqual([], sample_code_filenames) - def test_run_a_def_raise_expection(self): + def test_2_samplecodes(self): comments = """ placeholder Examples: .. code-block:: python - print(1/0) - """ - funcname = 'one_plus_one' - res, name, msg = sampcd_extract_and_run(comments, funcname) - self.assertFalse(res) - self.assertEqual(funcname, name) + print(1/0) -class Test_single_defcom_extract(unittest.TestCase): - def test_extract_from_func(self): - defstr = ''' -import os -def foo(): - """ - foo is a function. 
- """ - pass -def bar(): - pass -''' - comm = single_defcom_extract( - 2, defstr.splitlines(True), is_class_begin=False) - self.assertEqual(" foo is a function.\n", comm) - pass - - def test_extract_from_func_with_no_docstring(self): - defstr = ''' -import os -def bar(): - pass -''' - comm = single_defcom_extract( - 2, defstr.splitlines(True), is_class_begin=False) - self.assertEqual('', comm) - pass - - def test_extract_from_class(self): - defstr = r''' -import os -class Foo(): - """ - Foo is a class. - second line. - """ - pass - def bar(): - pass -def foo(): - pass -''' - comm = single_defcom_extract( - 2, defstr.splitlines(True), is_class_begin=True) - rcomm = """ Foo is a class. - second line. -""" - self.assertEqual(rcomm, comm) - pass + .. code-block:: python - def test_extract_from_class_with_no_docstring(self): - defstr = ''' -import os -class Foo(): - pass - def bar(): - pass -def foo(): - pass -''' - comm = single_defcom_extract( - 0, defstr.splitlines(True), is_class_begin=True) - self.assertEqual('', comm) + print(1+1) + """ + funcname = 'one_plus_one' + sample_code_filenames = sampcd_extract_to_file(comments, funcname) + self.assertCountEqual([ + os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example_1.py'), + os.path.join(SAMPLECODE_TEMP_DIR, funcname + '_example_2.py') + ], sample_code_filenames) class Test_get_api_md5(unittest.TestCase): @@ -268,174 +257,6 @@ def test_get_wlist(self): self.assertCountEqual(["deformable_conv"], gpu_not_white) -class Test_srccoms_extract(unittest.TestCase): - def setUp(self): - self.tmpDir = tempfile.mkdtemp() - sys.path.append(self.tmpDir) - self.api_pr_spec_filename = os.path.abspath( - os.path.join(os.getcwd(), "..", 'paddle/fluid/API_PR.spec')) - with open(self.api_pr_spec_filename, 'w') as f: - f.write("\n".join([ - """one_plus_one (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "one_plus_one"))""", - """two_plus_two (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "two_plus_two"))""", - """three_plus_three (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "three_plus_three"))""", - """four_plus_four (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "four_plus_four"))""", - ])) - - def tearDown(self): - sys.path.remove(self.tmpDir) - shutil.rmtree(self.tmpDir) - os.remove(self.api_pr_spec_filename) - - def test_from_ops_py(self): - filecont = ''' -def add_sample_code(obj, docstr): - pass - -__unary_func__ = [ - 'exp', -] - -__all__ = [] -__all__ += __unary_func__ -__all__ += ['one_plus_one'] - -def exp(): - pass -add_sample_code(globals()["exp"], r""" -Examples: - .. code-block:: python - import paddle - x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) - out = paddle.exp(x) - print(out) - # [0.67032005 0.81873075 1.10517092 1.34985881] -""") - -def one_plus_one(): - return 1+1 - -one_plus_one.__doc__ = """ - placeholder - - Examples: - .. code-block:: python - print(1+1) -""" - -__all__ += ['two_plus_two'] -def two_plus_two(): - return 2+2 -add_sample_code(globals()["two_plus_two"], """ - Examples: - .. 
code-block:: python - print(2+2) -""") -''' - pyfilename = os.path.join(self.tmpDir, 'ops.py') - with open(pyfilename, 'w') as pyfile: - pyfile.write(filecont) - self.assertTrue(os.path.exists(pyfilename)) - utsp = importlib.import_module('ops') - print('testing srccoms_extract from ops.py') - methods = ['one_plus_one', 'two_plus_two', 'exp'] - # os.remove("samplecode_temp/" "one_plus_one_example.py") - self.assertFalse( - os.path.exists("samplecode_temp/" - "one_plus_one_example.py")) - with open(pyfilename, 'r') as pyfile: - res, error_methods = srccoms_extract(pyfile, [], methods) - self.assertTrue(res) - self.assertTrue( - os.path.exists("samplecode_temp/" - "one_plus_one_example.py")) - os.remove("samplecode_temp/" "one_plus_one_example.py") - self.assertTrue( - os.path.exists("samplecode_temp/" - "two_plus_two_example.py")) - os.remove("samplecode_temp/" "two_plus_two_example.py") - self.assertTrue(os.path.exists("samplecode_temp/" "exp_example.py")) - os.remove("samplecode_temp/" "exp_example.py") - - def test_from_not_ops_py(self): - filecont = ''' -__all__ = [ - 'one_plus_one' -] - -def one_plus_one(): - """ - placeholder - - Examples: - .. code-block:: python - print(1+1) - """ - return 1+1 - -''' - pyfilename = os.path.join(self.tmpDir, 'opo.py') - with open(pyfilename, 'w') as pyfile: - pyfile.write(filecont) - utsp = importlib.import_module('opo') - methods = ['one_plus_one'] - with open(pyfilename, 'r') as pyfile: - res, error_methods = srccoms_extract(pyfile, [], methods) - self.assertTrue(res) - self.assertTrue( - os.path.exists("samplecode_temp/" - "one_plus_one_example.py")) - os.remove("samplecode_temp/" "one_plus_one_example.py") - - def test_with_empty_wlist(self): - """ - see test_from_ops_py - """ - pass - - def test_with_wlist(self): - filecont = ''' -__all__ = [ - 'four_plus_four', - 'three_plus_three' - ] - -def four_plus_four(): - """ - placeholder - - Examples: - .. code-block:: python - print(4+4) - """ - return 4+4 -def three_plus_three(): - """ - placeholder - - Examples: - .. code-block:: python - print(3+3) - """ - return 3+3 - -''' - pyfilename = os.path.join(self.tmpDir, 'three_and_four.py') - with open(pyfilename, 'w') as pyfile: - pyfile.write(filecont) - utsp = importlib.import_module('three_and_four') - methods = ['four_plus_four', 'three_plus_three'] - with open(pyfilename, 'r') as pyfile: - res, error_methods = srccoms_extract(pyfile, ['three_plus_three'], - methods) - self.assertTrue(res) - self.assertTrue( - os.path.exists("samplecode_temp/four_plus_four_example.py")) - os.remove("samplecode_temp/" "four_plus_four_example.py") - self.assertFalse( - os.path.exists("samplecode_temp/three_plus_three_example.py")) - - # https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/ops.py # why? unabled to use the ast module. emmmmm diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat index 736a19ddf52f4..4a61a99c34fa2 100644 --- a/tools/windows/build_compile_environment.bat +++ b/tools/windows/build_compile_environment.bat @@ -22,9 +22,9 @@ :: Include: :: 1. CMake 3.17.0 :: 2. Git 2.28.0 -:: 3. Python 3.7.8 -:: 4. Visual Studio 2015 with update 3 -:: 5. CUDA 10 +:: 3. Python 3.8.3 +:: 4. Visual Studio 2017 Community +:: 5. CUDA 11.2 :: 6. java jre :: 7. xly agent @@ -73,7 +73,6 @@ if %errorlevel% == 0 ( ) else ( echo Error***** Install Cmake-3.17.0 failed, please re-install it manually. 
) -del cmake-3.17.0-win64-x64.msi goto :eof :: ===== end step 1: cmake ===== @@ -99,91 +98,87 @@ if %errorlevel% == 0 ( ) else ( echo Error***** Install Git-2.28.0 failed, please re-install it manually. ) -del Git-2.28.0-64-bit.exe goto :eof :: ===== end step 2: Git ===== :: ===== start step 3: Python ===== -:: Download Python-3.7.8 and add in PATH when it not installed. -:: TODO: limit version >= 3.7.8 +:: Download Python-3.8.3 and add in PATH when it not installed. +:: TODO: limit version >= 3.8.3 :python -echo ">>>>>>>> step [3/7]: Python 3.7.8" -python -V 2>&1 | findstr /C:"Python 3.7.8" > nul 2> nul || call :install_python -goto vs2015 +echo ">>>>>>>> step [3/7]: Python 3.8.3" +python -V 2>&1 | findstr /C:"Python 3.8.3" > nul 2> nul || call :install_python +goto vs :install_python -echo There is not Python in this PC, will install Python-3.7.8. -echo Download package from https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe ... -wget -O python-3.7.8-amd64.exe https://npm.taobao.org/mirrors/python/3.7.8/python-3.7.8-amd64.exe -echo Install Python-3.7.8 ... +echo There is not Python in this PC, will install Python-3.8.3 +echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/python-3.8.3-amd64.exe ... +wget -O python-3.8.3-amd64.exe https://paddle-ci.gz.bcebos.com/window_requirement/python-3.8.3-amd64.exe +echo Install Python-3.8.3 ... :: /passive [silent install] :: InstallAllUsers [add path for all users] :: PrependPath [add script/install into PATH] :: TargetDir [install directory] -start /wait python-3.7.8-amd64.exe /passive InstallAllUsers=1 PrependPath=1 TargetDir=C:\Python37 +start /wait python-3.8.3-amd64.exe /passive InstallAllUsers=1 PrependPath=1 TargetDir=C:\Python38 if %errorlevel% == 0 ( - echo Install python-3.7.8 success! + echo Install python-3.8.3 success! ) else ( - echo Error***** Install python-3.7.8 failed, please re-install it manually. + echo Error***** Install python-3.8.3 failed, please re-install it manually. ) -del python-3.7.8-amd64.exe goto :eof :: ===== end step 3: Python ===== -:: ===== start step 4: Visual Studio 2015 ===== -:: Download Visual Studio 2015 when it not installed. -:vs2015 -echo ">>>>>>>> step [4/7]: Visual Studio 2015" -cmd /C "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 > nul 2> nul || call :install_visual_studio +:: ===== start step 4: Visual Studio 2017 Community ===== +:: Download Visual Studio 2017 when it not installed. +:vs +echo ">>>>>>>> step [4/7]: Visual Studio 2017 " +cmd /C "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" > nul 2> nul || call :install_visual_studio goto :cuda10 :install_visual_studio -echo There is not Visual Studio in this PC, will install VS2015. -echo Download package from "https://paddle-ci.gz.bcebos.com/window_requirement/en_visual_studio_enterprise_2015_with_update_3_x86_x64_web_installer_8922986.exe" -wget -O vs_installer.exe "https://paddle-ci.gz.bcebos.com/window_requirement/en_visual_studio_enterprise_2015_with_update_3_x86_x64_web_installer_8922986.exe" -echo Install Visual Studio 2015 ... +echo There is not Visual Studio in this PC, will install VS2017. +echo Download package from "https://paddle-ci.gz.bcebos.com/window_requirement/VS2017/vs_Community.exe" +wget -O vs_Community.exe "https://paddle-ci.gz.bcebos.com/window_requirement/VS2017/vs_Community.exe" +echo Install Visual Studio 2017 ... 
:: /passive [silent install] :: /norestart [no restart] :: /NoRefresh [no refresh] :: /InstallSelectableItems NativeLanguageSupport_Group [select Visual C++ for installing] -start /wait vs_installer.exe /passive /norestart /NoRefresh /InstallSelectableItems NativeLanguageSupport_Group +start /wait vs_Community.exe --passive --add Microsoft.VisualStudio.Workload.NativeDesktop --add Microsoft.VisualStudio.Workload.Universal --includeRecommended if %errorlevel% == 0 ( - echo Install Visual Studio 2015 success! + echo Install Visual Studio 2017 success! ) else ( - echo Error***** Install Visual Studio 2015 failed, please re-install it manually. + echo Error***** Install Visual Studio 2017 failed, please re-install it manually. ) -del vs_installer.exe goto :eof -:: ===== end step 4: Visual Studio 2015 ===== +:: ===== end step 4: Visual Studio 2017 ===== -:: ===== start step 5: CUDA 10 ===== +:: ===== start step 5: CUDA 11 ===== :cuda10 -echo ">>>>>>>> step [5/7]: CUDA 10.2" -cmd /C nvcc --version 2> nul | findstr /C:"10.2" > nul 2> nul || call :install_cuda +echo ">>>>>>>> step [5/7]: CUDA 11.2" +cmd /C nvcc --version 2> nul | findstr /C:"11.2" > nul 2> nul || call :install_cuda goto java-jre :install_cuda -echo There is not CUDA in this PC, will install CUDA-10.2. -echo Download package from "https://paddle-ci.gz.bcebos.com/window_requirement/cuda_10.2.89_441.22_win10.exe" -wget -O cuda_installer.exe "https://paddle-ci.gz.bcebos.com/window_requirement/cuda_10.2.89_441.22_win10.exe" -echo Install CUDA-10.2 ... +echo There is not CUDA in this PC, will install CUDA-11.2. +echo Download package from "https://paddle-ci.gz.bcebos.com/window_requirement/cuda_11.2.0_460.89_win10.exe" +wget -O cuda_installer.exe "https://paddle-ci.gz.bcebos.com/window_requirement/cuda_11.2.0_460.89_win10.exe" +echo Install CUDA-11.2 ... :: -s [silent install] start /wait cuda_installer.exe -s if %errorlevel% == 0 ( - echo Install CUDA-10.2 success! + echo Install CUDA-11.2 success! ) else ( - echo Error***** Install CUDA-10.2 failed, please re-install it manually. + echo Error***** Install CUDA-11.2 failed, please re-install it manually. 
goto :eof ) del cuda_installer.exe -echo Download cudnn from "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-10.2-windows10-x64-v7.6.5.32.zip" -wget -O cudnn-10.2-windows10-x64-v7.6.5.32.zip "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-10.2-windows10-x64-v7.6.5.32.zip" -tar xf cudnn-10.2-windows10-x64-v7.6.5.32.zip -xcopy /E /Y /R "cuda\bin\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin" -xcopy /E /Y /R "cuda\include\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\include" -xcopy /E /Y /R "cuda\lib\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\lib" +echo Download cudnn from "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-11.2-windows-x64-v8.1.0.77.zip" +wget -O cudnn-11.2-windows-x64-v8.1.0.77.zip "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-11.2-windows-x64-v8.1.0.77.zip" +tar xf cudnn-11.2-windows-x64-v8.1.0.77.zip +xcopy /E /Y /R "cuda\bin\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\bin" +xcopy /E /Y /R "cuda\include\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\include" +xcopy /E /Y /R "cuda\lib\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.2\lib" rd /s /q cuda -del cudnn-10.2-windows10-x64-v7.6.5.32.zip goto :eof :: ===== end step 5: CUDA 10 ===== @@ -212,7 +207,7 @@ goto :eof :: ===== start step 7: xly agent ===== :xly-agent echo ">>>>>>>> step [7/7]: xly agent" -wget -O agent.jar "https://paddle-ci.gz.bcebos.com/window_requirement/agent.jar" +wget -O agent.jar "https://xly.bce.baidu.com/sa_server/agent/v1/download?version=1.2.8" :: ===== end step 8: xly agent ===== pause
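
For reference, the reworked checker in tools/sampcd_processor.py now runs in two steps: sampcd_extract_to_file writes each ".. code-block:: python" section of an API docstring to its own file under samplecode_temp, and execute_samplecode runs each file with the current interpreter, skipping any file that carries a "# required: distributed|gpu|skip" directive. The sketch below is a minimal, standalone approximation of that flow, not the tool itself; the helper names extract_blocks and run_block, the SAMPLE_DIR temp directory, and the simplified block extraction are assumptions made for illustration — only the code-block marker and the required-directive regex are taken verbatim from the patch.

# Illustrative sketch only (assumed helper names; simplified extraction).
import os
import re
import subprocess
import sys
import tempfile

CODE_BLOCK_MARKER = "code-block:: python"
REQUIRED_RE = re.compile(r'#\s*required\s*:\s*(distributed|gpu|skip)')
SAMPLE_DIR = tempfile.mkdtemp(prefix="samplecode_temp_")


def extract_blocks(docstring, name):
    """Write every sample code block found in a docstring to its own .py file."""
    filenames = []
    chunks = docstring.split(CODE_BLOCK_MARKER)[1:]
    for idx, chunk in enumerate(chunks, start=1):
        # Simplification: keep every non-blank line after the marker and strip
        # the common leading indent; the real tool also drops leading blank
        # lines and stops at the first line that dedents out of the block.
        lines = [l for l in chunk.splitlines() if l.strip()]
        indent = len(lines[0]) - len(lines[0].lstrip()) if lines else 0
        code = "\n".join(l[indent:] for l in lines) + "\n"
        fname = os.path.join(SAMPLE_DIR, "{}_example_{}.py".format(name, idx))
        with open(fname, "w") as f:
            f.write(code)
        filenames.append(fname)
    return filenames


def run_block(fname):
    """Run one extracted sample; skip it when a 'required' directive is found."""
    with open(fname) as f:
        for line in f:
            if REQUIRED_RE.match(line):
                return True, fname, "{} is skipped. cause: {}".format(fname, line)
    proc = subprocess.Popen(
        [sys.executable, fname], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    return proc.returncode == 0, fname, out.decode("utf-8") + err.decode("utf-8")


if __name__ == "__main__":
    doc = """
    Examples:
        .. code-block:: python

            print(1 + 1)
    """
    for sample in extract_blocks(doc, "one_plus_one"):
        print(run_block(sample))

Separating extraction from execution is what lets the real script fan the per-file checks out over a multiprocessing.Pool (po.map_async(execute_samplecode, filenames.keys()) in the updated main path) instead of parsing source files and running samples in one pass.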