diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc
index 5009dddce2f14..2df86e86a75e0 100644
--- a/paddle/fluid/framework/executor_cache.cc
+++ b/paddle/fluid/framework/executor_cache.cc
@@ -79,7 +79,8 @@ void AppendSkipDeletionVars(const std::vector &append_vars,
  * 2. it is an input var used in backward_op
  */
 void ParseSafeEagerDeletionSkipVars(
-    const ProgramDesc &program, int64_t forward_op_nums,
+    const ProgramDesc &program,
+    int64_t forward_op_nums,
     const std::vector &output_var_names,
     std::vector *skip_eager_delete_vars) {
   auto all_ops = program.Block(0).AllOps();
@@ -143,8 +144,11 @@ ExecutorInfoCache &ExecutorInfoCache::Instance() {
 }
 
 static PEAndGraphPair CreateExecutorInfo(
-    const ProgramDesc &program_desc, const platform::Place &place,
-    int64_t start_op_index, int64_t end_op_index, framework::Scope *scope,
+    const ProgramDesc &program_desc,
+    const platform::Place &place,
+    int64_t start_op_index,
+    int64_t end_op_index,
+    framework::Scope *scope,
     const details::BuildStrategy &build_strategy) {
   auto execution_strategy = details::GetExecutionStrategy(place);
   auto graph = std::make_shared(
@@ -162,15 +166,17 @@ PEAndGraphPair CreateFixOrderExecutorInfo(const ProgramDesc &program_desc,
                                           framework::Scope *scope) {
   details::BuildStrategy build_strategy;
   build_strategy.fix_op_run_order_ = true;
-  auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index,
-                                         end_op_index, scope, build_strategy);
+  auto pe_and_graph = CreateExecutorInfo(
+      program_desc, place, start_op_index, end_op_index, scope, build_strategy);
   return pe_and_graph;
 }
 
 CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc,
                                    const platform::Place &place,
-                                   int64_t start_op_index, int64_t end_op_index,
-                                   bool is_grad, int64_t program_id,
+                                   int64_t start_op_index,
+                                   int64_t end_op_index,
+                                   bool is_grad,
+                                   int64_t program_id,
                                    framework::Scope *scope) {
   auto &cached_exe_info = framework::ExecutorInfoCache::Instance();
 
@@ -186,8 +192,12 @@ CacheInfo GetExecutorInfoFromCache(const ProgramDesc &program_desc,
   auto &build_strategy = cached_exe_info.GetBuildStrategy(program_id);
 
   // 2. Construct Graph and ParallelExecutor.
-  auto pe_and_graph = CreateExecutorInfo(program_desc, place, start_op_index,
-                                         end_op_index, scope, build_strategy);
+  auto pe_and_graph = CreateExecutorInfo(program_desc,
+                                         place,
+                                         start_op_index,
+                                         end_op_index,
+                                         scope,
+                                         build_strategy);
 
   // 3. Insert value into cached map.
   auto &cached_value = cached_exe_info.GetMutable(program_id, is_grad);
diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
index d62acb759868c..f28696194e5f6 100644
--- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
+++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc
@@ -115,21 +115,25 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
       auto type = std::string{"sgd"};
       // auto LearningRate = op->Input("LearningRate");
       auto use_nesterov = BOOST_GET_CONST(bool, op->GetAttr("use_nesterov"));
-      PADDLE_ENFORCE_EQ(use_nesterov, false,
+      PADDLE_ENFORCE_EQ(use_nesterov,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support nesterov mode."));
       auto regularization_method =
           BOOST_GET_CONST(std::string, op->GetAttr("regularization_method"));
-      PADDLE_ENFORCE_NE(regularization_method, "l1_decay",
+      PADDLE_ENFORCE_NE(regularization_method,
+                        "l1_decay",
                         platform::errors::Unimplemented(
                             "ipu does not support l1_decay mode."));
       auto multi_precision =
           BOOST_GET_CONST(bool, op->GetAttr("multi_precision"));
-      PADDLE_ENFORCE_EQ(multi_precision, false,
+      PADDLE_ENFORCE_EQ(multi_precision,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support multi_precision mode."));
       auto rescale_grad = BOOST_GET_CONST(float, op->GetAttr("rescale_grad"));
-      PADDLE_ENFORCE_EQ(rescale_grad, 1.0,
+      PADDLE_ENFORCE_EQ(rescale_grad,
+                        1.0,
                         platform::errors::Unimplemented(
                             "ipu does not support rescale_grad mode."));
       auto regularization_coeff =
@@ -150,10 +154,12 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
       auto lazy_mode = BOOST_GET_CONST(bool, op->GetAttr("lazy_mode"));
       auto multi_precision =
           BOOST_GET_CONST(bool, op->GetAttr("multi_precision"));
-      PADDLE_ENFORCE_EQ(lazy_mode, false,
+      PADDLE_ENFORCE_EQ(lazy_mode,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support lazy_mode mode."));
-      PADDLE_ENFORCE_EQ(multi_precision, false,
+      PADDLE_ENFORCE_EQ(multi_precision,
+                        false,
                         platform::errors::Unimplemented(
                             "ipu does not support multi_precision mode."));
       new_op.SetAttr("type", type);
@@ -268,11 +274,13 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const {
       VLOG(10) << "found loss op type: " << op->Type();
       auto outputs = op->Outputs();
       PADDLE_ENFORCE_EQ(
-          outputs.size(), 1,
+          outputs.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss key"));
       auto losses = outputs.begin()->second;
       PADDLE_ENFORCE_EQ(
-          losses.size(), 1,
+          losses.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss name"));
       auto loss_var = losses.front();
       new_op.SetAttr("loss_var", loss_var);
@@ -282,11 +290,13 @@
     } else if (op_type == "identity_loss") {
       auto outputs = op->Outputs();
       PADDLE_ENFORCE_EQ(
-          outputs.size(), 1,
+          outputs.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss key"));
       auto losses = outputs.begin()->second;
       PADDLE_ENFORCE_EQ(
-          losses.size(), 1,
+          losses.size(),
+          1,
           platform::errors::InvalidArgument("Can only support one loss name"));
       auto loss_var = losses.front();
       new_op.SetAttr("loss_var", loss_var);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index f5340f08677e2..189328e95abac 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -55,11 +55,13 @@ DECLARE_bool(sync_nccl_allreduce);
 #include "gperftools/profiler.h"
 #endif
PADDLE_DEFINE_EXPORTED_string( - pe_profile_fname, "", + pe_profile_fname, + "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); PADDLE_DEFINE_EXPORTED_bool( - enable_parallel_graph, false, + enable_parallel_graph, + false, "Force disable parallel graph execution mode if set false."); namespace paddle { @@ -166,8 +168,8 @@ class ParallelExecutorPrivate { std::vector flat_nccl_ids; if (nranks_ == 1) { // FIXME(gongwb): need not to create ncclid when nranks==1 - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -184,7 +186,8 @@ class ParallelExecutorPrivate { } else { nccl_id = new ncclUniqueId(); PADDLE_ENFORCE_EQ( - platform::dynload::ncclGetUniqueId(nccl_id), ncclSuccess, + platform::dynload::ncclGetUniqueId(nccl_id), + ncclSuccess, platform::errors::PreconditionNotMet( "PaddlePaddle failed to get NCCL unique ID. It may due to your " "system settings or NCCL library error, please debug on NCCL")); @@ -194,16 +197,16 @@ class ParallelExecutorPrivate { flat_nccl_ids.push_back(nccl_id); - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); VLOG(1) << "init bst nccl context complete!"; return; } // num_trainers ==1 && places > 1 if (bst.num_trainers_ == 1) { - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -217,8 +220,8 @@ class ParallelExecutorPrivate { flat_nccl_ids.push_back(nccl_id); } - nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, - bst.trainer_id_); + nccl_ctxs_->InitFlatCtxs( + places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); if (bst.use_hierarchical_allreduce_) { std::vector inter_nccl_ids; @@ -244,8 +247,12 @@ class ParallelExecutorPrivate { } nccl_ctxs_->InitHierarchicalCtxs( - places_, inter_nccl_ids, exter_nccl_ids, bst.num_trainers_, - bst.trainer_id_, bst.hierarchical_allreduce_inter_nranks_, + places_, + inter_nccl_ids, + exter_nccl_ids, + bst.num_trainers_, + bst.trainer_id_, + bst.hierarchical_allreduce_inter_nranks_, bst.hierarchical_allreduce_exter_nranks_); } } @@ -254,7 +261,8 @@ class ParallelExecutorPrivate { const std::string var_name = "NCCLCommunicator"; auto var = scope->FindVar(var_name); if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), true, + PADDLE_ENFORCE_EQ(var->IsInitialized(), + true, platform::errors::PreconditionNotMet( "if %s exists, it must be initialized", var_name)); VLOG(1) << "find " << var_name @@ -265,19 +273,23 @@ class ParallelExecutorPrivate { if (bst->use_hierarchical_allreduce_) { PADDLE_ENFORCE_GT( - bst->num_trainers_, 1, + bst->num_trainers_, + 1, platform::errors::PreconditionNotMet( "The num_trainers should be greater than 1, but received %llu.", bst->num_trainers_)); PADDLE_ENFORCE_GT( - bst->hierarchical_allreduce_inter_nranks_, 1, + bst->hierarchical_allreduce_inter_nranks_, + 1, platform::errors::PreconditionNotMet( "The inter_nranks should be greater than 1, but received %d.", bst->hierarchical_allreduce_inter_nranks_)); PADDLE_ENFORCE_EQ( - bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_, 0, + bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_, + 0, 
platform::errors::PreconditionNotMet( - "num_trainers:%llu mod inter_nranks:%d != 0", bst->num_trainers_, + "num_trainers:%llu mod inter_nranks:%d != 0", + bst->num_trainers_, bst->hierarchical_allreduce_inter_nranks_)); bst->hierarchical_allreduce_exter_nranks_ = @@ -296,15 +308,16 @@ class ParallelExecutorPrivate { << ", num_trainers:" << bst.num_trainers_ << ", trainer_id:" << bst.trainer_id_; - PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, false, + PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, + false, platform::errors::Unimplemented( "xpu doesn't support use_hierarchical_allreduce")); std::vector flat_bkcl_ids; if (nranks_ == 1) { // FIXME(gongwb): need not to create bkclid when nranks==1 - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -320,23 +333,24 @@ class ParallelExecutorPrivate { bkcl_id = bkcl_id_var->GetMutable(); } else { PADDLE_ENFORCE_EQ( - bkcl_get_unique_id(id.get()), BKCL_SUCCESS, + bkcl_get_unique_id(id.get()), + BKCL_SUCCESS, platform::errors::Unavailable("bkcl get unique id failed")); bkcl_id = id.get(); } flat_bkcl_ids.push_back(bkcl_id); - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); VLOG(1) << "init bst bkcl context complete!"; return; } // num_trainers ==1 && places > 1 if (bst.num_trainers_ == 1) { - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); return; } @@ -350,8 +364,8 @@ class ParallelExecutorPrivate { flat_bkcl_ids.push_back(bkcl_id); } - bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, - bst.trainer_id_); + bkcl_ctxs_->InitFlatCtxs( + places_, flat_bkcl_ids, bst.num_trainers_, bst.trainer_id_); } void InitOrGetBKCLCommunicator(framework::Scope *scope, @@ -359,7 +373,8 @@ class ParallelExecutorPrivate { const std::string var_name = "BKCLCommunicator"; auto var = scope->FindVar(var_name); if (var != nullptr) { - PADDLE_ENFORCE_EQ(var->IsInitialized(), true, + PADDLE_ENFORCE_EQ(var->IsInitialized(), + true, platform::errors::PreconditionNotMet( "if %s exists, it must be initialized", var_name)); VLOG(1) << "find " << var_name @@ -679,11 +694,11 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, platform::errors::Unavailable( "NPU is not supported in ParallelExecutor.")); InitP2P(places); - ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), - member_->places_.size()); + ir::InitReaderQueueDeviceCount( + graph, *(member_->global_scope_), member_->places_.size()); // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), - *graph); + InitExecutorPrivateMemberInfo( + exec_strategy, build_strategy, places.size(), *graph); // Step 1. 
Create local scopes and Clone graph into multi device CreateLocalScopes(scope, local_scopes, /*create_new*/ true); @@ -728,22 +743,29 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; if (!member_->build_strategy_.async_mode_) { member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + std::move(var_infos), + member_->places_, + std::move(member_->executor_))); } ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); SetReaderOpDeviceInfoOfGraphs(final_graphs); } -ParallelExecutor::ParallelExecutor(const platform::Place &place, Scope *scope, +ParallelExecutor::ParallelExecutor(const platform::Place &place, + Scope *scope, const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate({place}, scope)) { // Initialize necessary info of member_ with strategy. - InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, - /*device_count=*/1, *graph); + InitExecutorPrivateMemberInfo(exec_strategy, + build_strategy, + /*device_count=*/1, + *graph); CreateLocalScopes(scope, /*local_scope=*/{scope}, /*create_new=*/false); @@ -819,18 +841,24 @@ void ParallelExecutor::BCastParamsToDevices( buffers.push_back(buffer); } - PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + PADDLE_ENFORCE_EQ(member_->places_.size(), + buffers.size(), platform::errors::PreconditionNotMet( "variables' buffer size to bcast is %d, which is " "NOT equal to places size %d", - buffers.size(), member_->places_.size())); + buffers.size(), + member_->places_.size())); if (member_->nccl_ctxs_ != nullptr) { auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx(); platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); - platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); + platform::dynload::ncclBcast(buffers[i], + numel, + data_type, + 0, + nccl_ctx.comm_, + nccl_ctx.stream()); } nccl_ctxs->WaitAll(); } else { @@ -844,8 +872,12 @@ void ParallelExecutor::BCastParamsToDevices( platform::DeviceContextPool::Instance().Get(dst_place)); src_dev_ctx->Wait(); dst_dev_ctx->Wait(); - memory::Copy(dst_place, buffers[i], src_place, buffers[0], - sizeof_dtype, src_dev_ctx->stream()); + memory::Copy(dst_place, + buffers[i], + src_place, + buffers[0], + sizeof_dtype, + src_dev_ctx->stream()); src_dev_ctx->Wait(); dst_dev_ctx->Wait(); } @@ -879,16 +911,19 @@ void ParallelExecutor::BCastParamsToDevices( buffers.push_back(buffer); } - PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + PADDLE_ENFORCE_EQ(member_->places_.size(), + buffers.size(), platform::errors::PreconditionNotMet( "variables' buffer size to bcast is %d, which is " "NOT equal to places size %d", - buffers.size(), member_->places_.size())); + buffers.size(), + member_->places_.size())); { auto *bkcl_ctxs = member_->bkcl_ctxs_->DefaultFlatCtx(); PADDLE_ENFORCE_EQ( - bkcl_group_start(), BKCL_SUCCESS, + bkcl_group_start(), + BKCL_SUCCESS, platform::errors::Unavailable("bkcl_group_start failed")); for (size_t i = 0; i < member_->places_.size(); ++i) { auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[i]); @@ -898,13 +933,19 @@ void ParallelExecutor::BCastParamsToDevices( 
broadcast_numel *= 2; } PADDLE_ENFORCE_EQ( - bkcl_broadcast(bkcl_ctx.comm(), buffers[i], buffers[i], - broadcast_numel, data_type, 0, NULL), + bkcl_broadcast(bkcl_ctx.comm(), + buffers[i], + buffers[i], + broadcast_numel, + data_type, + 0, + NULL), BKCL_SUCCESS, platform::errors::Unavailable("bkcl_broadcast failed")); } PADDLE_ENFORCE_EQ( - bkcl_group_end(), BKCL_SUCCESS, + bkcl_group_end(), + BKCL_SUCCESS, platform::errors::Unavailable("bkcl_group_end failed")); } #else @@ -942,21 +983,24 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { - platform::RecordEvent record_run("ParallelExecutor::Run", - platform::TracerEventType::UserDefined, 1); + platform::RecordEvent record_run( + "ParallelExecutor::Run", platform::TracerEventType::UserDefined, 1); VLOG(3) << "enter ParallelExecutor Run"; #ifdef PADDLE_WITH_CUDA if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), + true, platform::errors::InvalidArgument( "Cannot fetch data when using CUDA Graph.")); PADDLE_ENFORCE_EQ( - member_->build_strategy_.allow_cuda_graph_capture_, true, + member_->build_strategy_.allow_cuda_graph_capture_, + true, platform::errors::InvalidArgument( "You must turn on build_strategy.allow_cuda_graph_capture = True " "to enable CUDA Graph capturing.")); PADDLE_ENFORCE_EQ( - member_->places_[0], platform::CUDAGraphCapturingPlace(), + member_->places_[0], + platform::CUDAGraphCapturingPlace(), platform::errors::InvalidArgument("The place to capture CUDAGraph is " "not the same as the place to run.")); } @@ -972,7 +1016,8 @@ FetchResultType ParallelExecutor::Run( ResetHasFeedGuard reset_has_feed_guard(member_); - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), fetch_tensors, + ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), + fetch_tensors, member_->HasGarbageCollectors()); VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; @@ -992,7 +1037,8 @@ void ParallelExecutor::RunWithoutFetch( ResetHasFeedGuard reset_has_feed_guard(member_); - ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), skip_eager_vars, + ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), + skip_eager_vars, member_->HasGarbageCollectors()); VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; @@ -1015,7 +1061,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( if (platform::IsCUDAGraphCapturing()) { for (auto &tensor : tensors) { PADDLE_ENFORCE_EQ( - tensor.empty(), true, + tensor.empty(), + true, platform::errors::PermissionDenied( "Feeding data is not permitted when capturing CUDA Graph.")); } @@ -1023,7 +1070,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( } if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(), + PADDLE_ENFORCE_EQ(tensors.size(), + member_->local_scopes_.size(), platform::errors::Unimplemented( "The feed data number %d does not match the device " "number %d. If you are using DataLoader to feed " @@ -1031,9 +1079,11 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( "in training network. Currently, drop_last=False for " "DataLoader is not supported for training network. 
" "Please set drop_last=True when defining DataLoader.", - tensors.size(), member_->local_scopes_.size())); + tensors.size(), + member_->local_scopes_.size())); } else { - PADDLE_ENFORCE_GE(member_->local_scopes_.size(), tensors.size(), + PADDLE_ENFORCE_GE(member_->local_scopes_.size(), + tensors.size(), platform::errors::InvalidArgument( "The feed tensor number exceeds the device number")); } @@ -1063,7 +1113,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( } if (!member_->AllowPartialFeed()) { - PADDLE_ENFORCE_EQ(feed_num, member_->local_scopes_.size(), + PADDLE_ENFORCE_EQ(feed_num, + member_->local_scopes_.size(), platform::errors::Unimplemented( "The feed data number %d does not match the device " "number %d. If you are using DataLoader to feed " @@ -1071,7 +1122,8 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( "in training network. Currently, drop_last=False for " "DataLoader is not supported for training network. " "Please set drop_last=True when defining DataLoader.", - feed_num, member_->local_scopes_.size())); + feed_num, + member_->local_scopes_.size())); } } @@ -1079,7 +1131,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { if (platform::IsCUDAGraphCapturing()) { PADDLE_ENFORCE_EQ( - tensors.empty(), true, + tensors.empty(), + true, platform::errors::PermissionDenied( "Feeding data is not permitted when capturing CUDA Graph.")); return; @@ -1103,7 +1156,9 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( auto error_info = string::Sprintf( "The number(%d) of samples[%s] of current batch is less than the " "count(%d) of devices(%s), currently, it is not allowed. ", - lod_tensors.size(), pair.first, num_places, + lod_tensors.size(), + pair.first, + num_places, (is_cpu_place ? "CPU" : "GPU")); if (is_cpu_place) { error_info += @@ -1116,10 +1171,12 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( lod_tensors.reserve(num_places); auto &tensor = lod_tensors.front(); PADDLE_ENFORCE_EQ( - tensor.dims(), pair.second.dims(), + tensor.dims(), + pair.second.dims(), platform::errors::PreconditionNotMet("The dim doesn't match.")); PADDLE_ENFORCE_EQ( - tensor.place(), member_->places_.at(0), + tensor.place(), + member_->places_.at(0), platform::errors::PreconditionNotMet("The place doesn't match.")); for (size_t i = 1; i < num_places; ++i) { lod_tensors.emplace_back(); @@ -1135,8 +1192,13 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( "sample will be copied in %d copies and be sent to different " "places separately. If you need that different place has different " "value, you should feed %d samples.", - lod_tensors.size(), pair.first, num_places, - (is_cpu_place ? "CPU" : "GPU"), pair.first, num_places, num_places); + lod_tensors.size(), + pair.first, + num_places, + (is_cpu_place ? 
"CPU" : "GPU"), + pair.first, + num_places, + num_places); PADDLE_THROW(platform::errors::PreconditionNotMet(error_info)); } } @@ -1147,7 +1209,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( persistable_feed_len = lod_tensors.size(); } else { PADDLE_ENFORCE_EQ( - persistable_feed_len, lod_tensors.size(), + persistable_feed_len, + lod_tensors.size(), platform::errors::InvalidArgument( "The feeded number of different persistable variables " "should be the same")); @@ -1157,7 +1220,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( non_persistable_feed_len = lod_tensors.size(); } else { PADDLE_ENFORCE_EQ( - non_persistable_feed_len, lod_tensors.size(), + non_persistable_feed_len, + lod_tensors.size(), platform::errors::InvalidArgument( "The feeded number of different non-persistable variables " "should be the same")); @@ -1180,7 +1244,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( non_persistable_feed_len != -1UL) { VLOG(10) << "Persistable len " << persistable_feed_len; VLOG(10) << "Non persistable len " << non_persistable_feed_len; - PADDLE_ENFORCE_GE(persistable_feed_len, non_persistable_feed_len, + PADDLE_ENFORCE_GE(persistable_feed_len, + non_persistable_feed_len, platform::errors::InvalidArgument( "The feeded number of persistable variables should " "not be less than non-persistable variables")); @@ -1201,7 +1266,8 @@ ParallelExecutor::~ParallelExecutor() { } bool ParallelExecutor::EnableParallelGraphExecution( - const ir::Graph &graph, const ExecutionStrategy &exec_strategy, + const ir::Graph &graph, + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const { if (!FLAGS_enable_parallel_graph) { return false; @@ -1242,8 +1308,10 @@ bool ParallelExecutor::EnableParallelGraphExecution( } void ParallelExecutor::InitExecutorPrivateMemberInfo( - const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, - size_t device_count, const ir::Graph &graph) { + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph) { member_->use_device_ = exec_strategy.use_device_; member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = member_->build_strategy_.reduce_ == @@ -1259,7 +1327,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( - device_count, 1, + device_count, + 1, platform::errors::Unavailable("Windows can support Single GPU only.")); } #endif @@ -1268,7 +1337,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( - device_count, 1, + device_count, + 1, platform::errors::PermissionDenied( "Your machine has multiple cards, " "but the WITH_NCCL option is not turned on during compilation, " @@ -1289,14 +1359,16 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( VLOG(1) << string::Sprintf( "The Program will be executed on %s using ParallelExecutor, %lu " "cards are used, so %lu programs are executed in parallel.", - device_name, device_count, device_count); + device_name, + device_count, + device_count); // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. 
member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(graph, exec_strategy, - member_->build_strategy_); + EnableParallelGraphExecution( + graph, exec_strategy, member_->build_strategy_); if (member_->build_strategy_.enable_parallel_graph_) { LOG(INFO) << "The Executor would execute the graph by ParallelGraph " "Execution which can get better performance," @@ -1305,7 +1377,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } void ParallelExecutor::CreateLocalScopes( - Scope *global_scope, const std::vector &local_scopes, + Scope *global_scope, + const std::vector &local_scopes, bool create_new) { if (local_scopes.empty()) { member_->own_local_scope_ = true; @@ -1315,11 +1388,13 @@ void ParallelExecutor::CreateLocalScopes( } } else { member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + PADDLE_ENFORCE_EQ(member_->places_.size(), + local_scopes.size(), platform::errors::PreconditionNotMet( "member_->places_.size() = %d is not equal to " "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); + member_->places_.size(), + local_scopes.size())); for (size_t i = 0; i < member_->places_.size(); ++i) { if (create_new) { member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); @@ -1344,12 +1419,13 @@ std::unordered_map ParallelExecutor::CreateLocalExecScopes( scope_map.emplace(scope, local_exec_scope); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), + member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), + member_->local_exec_scopes_.size())); return scope_map; } @@ -1358,7 +1434,8 @@ std::vector ParallelExecutor::CloneGraphToMultiDevices( ir::Graph *graph) { std::vector graphs; if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), + false, platform::errors::Unavailable( "gpu mode does not support async_mode_ now!")); graphs.push_back(graph); @@ -1424,7 +1501,8 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { } std::vector ParallelExecutor::CompileGraphWithBuildStrategy( - ir::Graph *graph, std::vector *device_graphs, + ir::Graph *graph, + std::vector *device_graphs, const std::string &loss_var_name) { auto device_count = member_->places_.size(); std::vector async_graphs(device_count); @@ -1432,66 +1510,99 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( auto &graphs = *device_graphs; #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), device_count, + PADDLE_ENFORCE_EQ(graphs.size(), + device_count, platform::errors::PreconditionNotMet( "graphs.size() shoule be %d, but received %d", - device_count, graphs.size())); + device_count, + graphs.size())); VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); + graph = 
member_->build_strategy_.Apply(graph, + {member_->places_[0]}, + loss_var_name, + {member_->local_scopes_[0]}, + 1, + member_->use_device_, + member_->nccl_ctxs_); for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); + graphs[i] = member_->build_strategy_.Apply(graphs[i], + {member_->places_[i]}, + loss_var_name, + {member_->local_scopes_[i]}, + 1, + member_->use_device_, + member_->nccl_ctxs_); async_graphs[i] = graphs[i]; } } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + graph = member_->build_strategy_.Apply(graph, + member_->places_, + loss_var_name, + member_->local_scopes_, + member_->nranks_, + member_->use_device_, + member_->nccl_ctxs_); } #elif defined(PADDLE_WITH_XPU_BKCL) if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(graphs.size(), device_count, + PADDLE_ENFORCE_EQ(graphs.size(), + device_count, platform::errors::PreconditionNotMet( "graphs.size() shoule be %d, but received %d", - device_count, graphs.size())); + device_count, + graphs.size())); VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); + graph = member_->build_strategy_.Apply(graph, + {member_->places_[0]}, + loss_var_name, + {member_->local_scopes_[0]}, + 1, + member_->use_device_, + member_->bkcl_ctxs_); for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); + graphs[i] = member_->build_strategy_.Apply(graphs[i], + {member_->places_[i]}, + loss_var_name, + {member_->local_scopes_[i]}, + 1, + member_->use_device_, + member_->bkcl_ctxs_); async_graphs[i] = graphs[i]; } } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + graph = member_->build_strategy_.Apply(graph, + member_->places_, + loss_var_name, + member_->local_scopes_, + member_->nranks_, + member_->use_device_, + member_->bkcl_ctxs_); } #else if (member_->build_strategy_.async_mode_) { VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); + graph = member_->build_strategy_.Apply(graph, + {member_->places_[0]}, + loss_var_name, + {member_->local_scopes_[0]}, + 1, + member_->use_device_); for (size_t i = 1; i < device_count; ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_); + graphs[i] = member_->build_strategy_.Apply(graphs[i], + {member_->places_[i]}, + loss_var_name, + {member_->local_scopes_[i]}, + 1, + member_->use_device_); async_graphs[i] = graphs[i]; } } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); + graph = member_->build_strategy_.Apply(graph, + member_->places_, + loss_var_name, + member_->local_scopes_, + member_->nranks_, + member_->use_device_); } 
#endif @@ -1501,11 +1612,13 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( void ParallelExecutor::CreateVariableInfos( std::vector *var_infos, ir::Graph *graph) { PADDLE_ENFORCE_EQ( - var_infos->size(), 0, + var_infos->size(), + 0, platform::errors::PreconditionNotMet( "var_infos->size() shoule be 0, but received %d", var_infos->size())); PADDLE_ENFORCE_EQ( - member_->is_persistable_.size(), 0, + member_->is_persistable_.size(), + 0, platform::errors::PreconditionNotMet( "member_->is_persistable_.size() shoule be 0, but received %d", member_->is_persistable_.size())); @@ -1535,14 +1648,18 @@ void ParallelExecutor::CreateVariableInfos( std::vector ParallelExecutor::CreateSSAGraphExecutor( const ExecutionStrategy &exec_strategy, - std::vector *async_graphs, ir::Graph *graph) { + std::vector *async_graphs, + ir::Graph *graph) { std::vector final_graphs; if (member_->build_strategy_.async_mode_) { VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, *async_graphs)); + member_->executor_.reset( + new details::AsyncSSAGraphExecutor(exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + *async_graphs)); final_graphs = *async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; @@ -1552,9 +1669,12 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( bool is_inference = details::IsDataParallelInferenceGraph(*graph); bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); + auto *pg_exe = + new details::ParallelSSAGraphExecutor(exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph); final_graphs = pg_exe->Graphs(); member_->executor_.reset(pg_exe); @@ -1580,8 +1700,11 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + std::move(possible_inference_graphs)); if (!has_drop_last_read_op) { VLOG(5) << "Enable partial feed support in inference phase"; pg_exe->EnablePartialFeedSupport(); @@ -1598,16 +1721,22 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( "network. 
It is automatically turned to drop_last=True."; if (exec_strategy.type_ == ExecutionStrategy::kDefault) { VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); + member_->executor_.reset( + new details::ThreadedSSAGraphExecutor(exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph)); } else { if (member_->use_device_ == p::kXPU) { #if defined(PADDLE_WITH_XPU) VLOG(3) << "use BindThreadedSSAGraphExecutor"; member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph)); #else PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't use XPU device since it's not compiled with XPU," @@ -1616,8 +1745,11 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( } else { VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); + exec_strategy, + member_->local_scopes_, + member_->local_exec_scopes_, + member_->places_, + graph)); } } final_graphs.emplace_back(graph); @@ -1630,12 +1762,14 @@ void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( const std::vector &final_graphs, const std::unordered_map &scope_map) { PADDLE_ENFORCE_GE( - final_graphs.size(), 1, + final_graphs.size(), + 1, platform::errors::PreconditionNotMet( "final_graphs shoule contain at least one graph, but received %d", final_graphs.size())); - PADDLE_ENFORCE_GT(scope_map.size(), 0, + PADDLE_ENFORCE_GT(scope_map.size(), + 0, platform::errors::PreconditionNotMet( "scope_map shoule contain at least one " "element, but received %d", @@ -1676,21 +1810,26 @@ void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { if (!build_strategy.allow_cuda_graph_capture_) return; #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_EQ( - build_strategy.async_mode_, false, + build_strategy.async_mode_, + false, platform::errors::InvalidArgument( "Async Executor does not support CUDA Graph capturing.")); PADDLE_ENFORCE_EQ( - platform::IsCUDAGraphCapturing(), false, + platform::IsCUDAGraphCapturing(), + false, platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " "when running the first batch.")); PADDLE_ENFORCE_EQ( - member_->places_.size(), 1, + member_->places_.size(), + 1, platform::errors::InvalidArgument( "CUDA Graph is only supported when one GPU device is running.")); - PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true, + PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), + true, platform::errors::InvalidArgument( "CUDA Graph is only supported on NVIDIA GPU device.")); - PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false, + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, + false, platform::errors::InvalidArgument( "FLAGS_sync_nccl_allreduce must be False to support " "CUDA Graph capturing.")); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index ac7bf6d87da2d..3015224656890 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -91,8 +91,8 @@ void HandleComplexGradToRealGrad(const NameVarMap& outs) { << 
framework::DataTypeToString(var->ForwardDataType()) << " real var in dynamic graph."; framework::Tensor out; - framework::TransComplexToReal(var->ForwardDataType(), var->DataType(), - *tensor, &out); + framework::TransComplexToReal( + var->ForwardDataType(), var->DataType(), *tensor, &out); SetTensorToVariable(var->Var(), out, var->MutableVar()); } } @@ -147,8 +147,10 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, template PreparedOp PrepareImpl( - const NameVarMap& ins, const NameVarMap& outs, - const framework::OperatorWithKernel& op, const platform::Place& place, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::OperatorWithKernel& op, + const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const phi::KernelFactory& phi_kernel_factory, @@ -267,9 +269,14 @@ PreparedOp PrepareImpl( dev_ctx = pool.Get(expected_kernel_key.place_); } - return PreparedOp(op, empty_ctx, expected_kernel_key, arg_map_fn, - default_kernel_signature, std::move(kernel_signature), - phi_kernel, dev_ctx); + return PreparedOp(op, + empty_ctx, + expected_kernel_key, + arg_map_fn, + default_kernel_signature, + std::move(kernel_signature), + phi_kernel, + dev_ctx); } else { VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; @@ -318,16 +325,21 @@ PreparedOp PrepareImpl( << " | kernel: " << pt_cpu_kernel; auto* cpu_ctx = pool.Get(paddle::platform::CPUPlace()); return PreparedOp( - op, empty_ctx, + op, + empty_ctx, framework::TransPhiKernelKeyToOpKernelType(pt_cpu_kernel_key), - arg_map_fn, default_kernel_signature, std::move(kernel_signature), - pt_cpu_kernel, cpu_ctx); + arg_map_fn, + default_kernel_signature, + std::move(kernel_signature), + pt_cpu_kernel, + cpu_ctx); } } } PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::NotFound( "There are no kernels which are registered in the %s operator.", op.Type())); @@ -413,17 +425,24 @@ PreparedOp PrepareImpl( #endif // TODO(jiabin): Add operator.cc's line 1000 part back when we need that // case - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator %s does not have kernel for %s.", op.Type(), - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator %s does not have kernel for %s.", + op.Type(), + KernelTypeToString(expected_kernel_key))); if (!(expected_kernel_key.place_ == place)) { dev_ctx = pool.Get(expected_kernel_key.place_); } - return PreparedOp(op, empty_ctx, expected_kernel_key, kernel_iter->second, - arg_map_fn, default_kernel_signature, dev_ctx); + return PreparedOp(op, + empty_ctx, + expected_kernel_key, + kernel_iter->second, + arg_map_fn, + default_kernel_signature, + dev_ctx); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -432,8 +451,14 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs, - phi_kernel_factory, phi_op_utils_map, + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, default_phi_kernel_sig_map); } @@ -443,9 +468,15 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) 
{ - return PrepareImpl( - ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, - phi_op_utils_map, default_phi_kernel_sig_map); + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, + default_phi_kernel_sig_map); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -454,39 +485,55 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const platform::Place& place, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - return PrepareImpl( - ins, outs, op, place, attrs, default_attrs, phi_kernel_factory, - phi_op_utils_map, default_phi_kernel_sig_map); + return PrepareImpl(ins, + outs, + op, + place, + attrs, + default_attrs, + phi_kernel_factory, + phi_op_utils_map, + default_phi_kernel_sig_map); } template static void PreparedOpRunImpl( - const framework::OperatorBase& op, const framework::RuntimeContext& ctx, + const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::OperatorWithKernel::OpKernelFunc& func, const phi::ArgumentMappingFn* arg_map_fn, const phi::KernelSignature* default_kernel_signature, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + platform::DeviceContext* dev_ctx, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { // TODO(zjl): remove scope in dygraph { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type, - arg_map_fn, default_kernel_signature); + 1, + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx(&ins, + &outs, + &attrs, + &default_attrs, + op.Type(), + &kernel_type, + arg_map_fn, + default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); } { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); - func(DygraphExecutionContext(op, empty_scope, *dev_ctx, ctx, ins, - outs, attrs, default_attrs)); + func(DygraphExecutionContext( + op, empty_scope, *dev_ctx, ctx, ins, outs, attrs, default_attrs)); } if (FLAGS_check_nan_inf) { @@ -525,30 +572,45 @@ static void PreparedOpRunPtImpl( const framework::OpKernelType& kernel_type, const phi::ArgumentMappingFn* arg_map_fn, const phi::KernelSignature* default_kernel_signature, - const phi::KernelSignature& kernel_signature, const phi::Kernel& phi_kernel, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + const phi::KernelSignature& kernel_signature, + const phi::Kernel& phi_kernel, + platform::DeviceContext* dev_ctx, + const NameVarMap& ins, + const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); - DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type, - arg_map_fn, default_kernel_signature); + 1, + platform::EventRole::kInnerOp); + DygraphInferShapeContext infer_shape_ctx(&ins, + &outs, + &attrs, + &default_attrs, + op.Type(), + &kernel_type, + 
arg_map_fn, + default_kernel_signature); op.Info().infer_shape_(&infer_shape_ctx); } { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); PreparePhiData(phi_kernel, kernel_signature, ins); phi::KernelContext pt_kernel_context; - BuildDygraphPhiKernelContext(kernel_signature, phi_kernel, ins, - outs, attrs, default_attrs, dev_ctx, + BuildDygraphPhiKernelContext(kernel_signature, + phi_kernel, + ins, + outs, + attrs, + default_attrs, + dev_ctx, &pt_kernel_context); phi_kernel(&pt_kernel_context); @@ -577,14 +639,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl(op_, kernel_type_, arg_map_fn_, - default_kernel_signature_, kernel_signature_, - phi_kernel_, dev_ctx_, ins, outs, attrs, + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, default_attrs); } else { - PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, arg_map_fn_, - default_kernel_signature_, dev_ctx_, ins, outs, - attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } @@ -593,14 +670,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl( - op_, kernel_type_, arg_map_fn_, default_kernel_signature_, - kernel_signature_, phi_kernel_, dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } else { - PreparedOpRunImpl( - op_, ctx_, kernel_type_, func_, arg_map_fn_, default_kernel_signature_, - dev_ctx_, ins, outs, attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } @@ -609,14 +701,29 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { if (run_phi_kernel_) { - PreparedOpRunPtImpl( - op_, kernel_type_, arg_map_fn_, default_kernel_signature_, - kernel_signature_, phi_kernel_, dev_ctx_, ins, outs, attrs, - default_attrs); + PreparedOpRunPtImpl(op_, + kernel_type_, + arg_map_fn_, + default_kernel_signature_, + kernel_signature_, + phi_kernel_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } else { - PreparedOpRunImpl( - op_, ctx_, kernel_type_, func_, arg_map_fn_, default_kernel_signature_, - dev_ctx_, ins, outs, attrs, default_attrs); + PreparedOpRunImpl(op_, + ctx_, + kernel_type_, + func_, + arg_map_fn_, + default_kernel_signature_, + dev_ctx_, + ins, + outs, + attrs, + default_attrs); } } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 4c4ddf24dab68..5b1c4d3376cb8 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -181,15 +181,23 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( } template -void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameVarMap& ins, const NameVarMap& outs, 
framework::AttributeMap attrs, - const platform::Place& place, bool trace_backward, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { - TraceOpImpl(type, ins, outs, attrs, place, trace_backward, - inplace_map, passed_default_attrs_, + TraceOpImpl(type, + ins, + outs, + attrs, + place, + trace_backward, + inplace_map, + passed_default_attrs_, use_default_attr_map); } @@ -198,7 +206,8 @@ void Tracer::TraceOpImpl(const std::string& type, const NameVarMap& ins, const NameVarMap& outs, framework::AttributeMap& attrs, - const platform::Place& place, bool trace_backward, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { @@ -237,8 +246,9 @@ void Tracer::TraceOpImpl(const std::string& type, const auto& tracer = imperative::GetCurrentTracer(); VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; ins_amp = std::make_unique>( - AutoCastInputs(type, imperative::AutoTuneLayout( - type, ins, outs, &attrs, tracer))); + AutoCastInputs(type, + imperative::AutoTuneLayout( + type, ins, outs, &attrs, tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type; ins_amp = std::make_unique>( @@ -248,10 +258,10 @@ void Tracer::TraceOpImpl(const std::string& type, if (amp_dtype_ == phi::DataType::FLOAT16) { const auto& tracer = imperative::GetCurrentTracer(); VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; - ins_amp = - std::make_unique>(CastPureFp16Inputs( - type, imperative::AutoTuneLayout(type, ins, outs, &attrs, - tracer))); + ins_amp = std::make_unique>( + CastPureFp16Inputs(type, + imperative::AutoTuneLayout( + type, ins, outs, &attrs, tracer))); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; ins_amp = std::make_unique>( @@ -316,10 +326,12 @@ void Tracer::TraceOpImpl(const std::string& type, framework::AppendErrorOpHint(type, &exception); throw std::move(exception); } catch (std::exception& ex) { - PADDLE_THROW(platform::errors::Fatal( - "Operator %s raises an %s exception.\n" - "The exception content is\n:%s.", - type, platform::demangle(typeid(ex).name()), ex.what())); + PADDLE_THROW( + platform::errors::Fatal("Operator %s raises an %s exception.\n" + "The exception content is\n:%s.", + type, + platform::demangle(typeid(ex).name()), + ex.what())); } catch (...) { // NOTE: this branch represents a very serious bug with // low probability of occurrence, and we can't get its @@ -339,13 +351,14 @@ void Tracer::TraceOpImpl(const std::string& type, if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, + passed_default_attrs_, + nullptr, paddle::platform::errors::PermissionDenied( "We expect passed_default_attrs_ is nullptr while " "use_default_attr_map is true, however we got not null " "passed_default_attrs_. Please check your usage of trace_op. 
")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); + CreateGradOpNode( + *op, new_ins, outs, attrs, default_attrs, place, inplace_map); } else { VLOG(3) << "No Grad to track for Op: " << type; } @@ -354,27 +367,43 @@ void Tracer::TraceOpImpl(const std::string& type, } template void Tracer::TraceOp( - const std::string& type, const NameVarMap& ins, - const NameVarMap& outs, framework::AttributeMap attrs, - const platform::Place& place, bool trace_backward, + const std::string& type, + const NameVarMap& ins, + const NameVarMap& outs, + framework::AttributeMap attrs, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map, - paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); + paddle::framework::AttributeMap* default_attrs, + bool use_default_attr_map); template void Tracer::TraceOp( - const std::string& type, const NameVarMap& ins, - const NameVarMap& outs, framework::AttributeMap attrs, - const platform::Place& place, bool trace_backward, + const std::string& type, + const NameVarMap& ins, + const NameVarMap& outs, + framework::AttributeMap attrs, + const platform::Place& place, + bool trace_backward, const std::map& inplace_map_, - paddle::framework::AttributeMap* default_attrs, bool use_default_attr_map); + paddle::framework::AttributeMap* default_attrs, + bool use_default_attr_map); -void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, - const NameVarBaseMap& outs, framework::AttributeMap attrs, +void Tracer::TraceOp(const std::string& type, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs, + framework::AttributeMap attrs, const std::map& inplace_map) { - TraceOp(type, ins, outs, std::move(attrs), expected_place_, - has_grad_, inplace_map); + TraceOp(type, + ins, + outs, + std::move(attrs), + expected_place_, + has_grad_, + inplace_map); } -void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameTensorMap& ins, const NameTensorMap& outs, paddle::framework::AttributeMap& attrs, const paddle::platform::Place& place, @@ -383,26 +412,41 @@ void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: " << use_default_attr_map; - TraceOpImpl(type, ins, outs, attrs, place, false, - inplace_map, default_attrs, + TraceOpImpl(type, + ins, + outs, + attrs, + place, + false, + inplace_map, + default_attrs, use_default_attr_map); } -void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameTensorMap& ins, const NameTensorMap& outs, paddle::framework::AttributeMap attrs) { VLOG(6) << "Running On Eager TraceOp(4 agrs): "; - TraceOpImpl(type, ins, outs, attrs, expected_place_, - false, {}, nullptr, true); + TraceOpImpl( + type, ins, outs, attrs, expected_place_, false, {}, nullptr, true); } -void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins, +void Tracer::TraceOp(const std::string& type, + const NameTensorMap& ins, const NameTensorMap& outs, paddle::framework::AttributeMap& attrs, const std::map& inplace_map) { VLOG(6) << "Running On Eager TraceOp(less): "; - TraceOpImpl(type, ins, outs, attrs, expected_place_, - false, inplace_map, nullptr, true); + TraceOpImpl(type, + ins, + outs, + attrs, + expected_place_, + false, + inplace_map, + nullptr, + true); } void Tracer::SetExpectedPlace(platform::Place place) { @@ 
-434,8 +478,10 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, } phi::KernelSignature Tracer::GetExpectedKernelSignature( - const std::string& type, const NameTensorMap& ins, - const NameTensorMap& outs, framework::AttributeMap attrs) const { + const std::string& type, + const NameTensorMap& ins, + const NameTensorMap& outs, + framework::AttributeMap attrs) const { auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); framework::RuntimeContext ctx({}, {}); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -451,11 +497,18 @@ phi::KernelSignature Tracer::GetExpectedKernelSignature( : attr_checker->GetDefaultAttrMap(); auto dygraph_exe_ctx = imperative::DygraphExecutionContext( - *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + *op, + framework::Scope(), + *dev_ctx, + ctx, + ins, + outs, + attrs, default_attrs); auto* opbase_with_kernel = dynamic_cast(op.get()); - PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + PADDLE_ENFORCE_NE(opbase_with_kernel, + nullptr, platform::errors::InvalidArgument( "This op type:`%s` is not a OperatorWithKernel, only " "OperatorWithKernel can get KernelSignature", diff --git a/paddle/fluid/operators/identity_loss_op.cc b/paddle/fluid/operators/identity_loss_op.cc index f2b28ba37d339..bc9986c7ffea1 100644 --- a/paddle/fluid/operators/identity_loss_op.cc +++ b/paddle/fluid/operators/identity_loss_op.cc @@ -91,14 +91,18 @@ DECLARE_INPLACE_OP_INFERER(IdentityLossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(identity_loss, IdentityLossInferShapeFunctor, +DECLARE_INFER_SHAPE_FUNCTOR(identity_loss, + IdentityLossInferShapeFunctor, PD_INFER_META(phi::IdentityLossInferMeta)); -REGISTER_OPERATOR(identity_loss, ops::IdentityLossOp, ops::IdentityLossOpMaker, +REGISTER_OPERATOR(identity_loss, + ops::IdentityLossOp, + ops::IdentityLossOpMaker, ops::IdentityLossGradMaker, ops::IdentityLossGradMaker, ops::IdentityLossInplaceInferer, IdentityLossInferShapeFunctor); -REGISTER_OPERATOR(identity_loss_grad, ops::IdentityLossGradOp, +REGISTER_OPERATOR(identity_loss_grad, + ops::IdentityLossGradOp, ops::IdentityLossGradInplaceInferer); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index fbe3e1ea639d0..c75c36a278cb3 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -93,7 +93,9 @@ struct ConstantOpAttrVisitor : public boost::static_visitor { void operator()(const std::vector& vec) const { if (dtype_ == VarType::FP16) { std::vector vec_fp16; - std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp16), + std::transform(vec.begin(), + vec.end(), + std::back_inserter(vec_fp16), [](float f) -> float16 { return float16(f); }); framework::TensorFromVector(vec_fp16, tensor_); } else { @@ -109,7 +111,9 @@ struct ConstantOpAttrVisitor : public boost::static_visitor { void operator()(const std::vector& vec) const { // popart do not support float64 constant std::vector vec_fp32; - std::transform(vec.begin(), vec.end(), std::back_inserter(vec_fp32), + std::transform(vec.begin(), + vec.end(), + std::back_inserter(vec_fp32), [](double f) -> float { return static_cast(f); }); framework::TensorFromVector(vec_fp32, tensor_); } @@ -362,7 +366,8 @@ void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& fetch_name : fetch_list) { auto tensor = resources_->tensors.find(fetch_name); PADDLE_ENFORCE_NE( - tensor, 
resources_->tensors.end(), + tensor, + resources_->tensors.end(), platform::errors::NotFound( "Output tensor %s is not found, please check the model.", fetch_name)); @@ -429,8 +434,9 @@ void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "lowering weight: " << var_name; auto var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::NotFound("Tensor %s is not found in the scope", - var_name)); + var, + platform::errors::NotFound("Tensor %s is not found in the scope", + var_name)); auto tensor = var->Get(); auto dtype = PhiDType2PopartDType(tensor.dtype()); auto shape = std::vector(); @@ -482,9 +488,12 @@ void Compiler::LowerBody() { VLOG(10) << "Build graph from custom op: " << __op_type; auto it = custom_ops_.find(__op_type); NameScopeHelper ns_helper(op_desc, builder_.get()); - auto output_ids = - builder_->customOp(it->second.popart_op, it->second.popart_op.version, - inputs, outputs.size(), attributes, debug_context); + auto output_ids = builder_->customOp(it->second.popart_op, + it->second.popart_op.version, + inputs, + outputs.size(), + attributes, + debug_context); PostLower(output_ids, op_desc); } else if (op_type == "popart_printtensor") { auto inputs = GetOpInputs(op_desc); @@ -572,14 +581,17 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(momentum, true), popart::SGD::getUnsetDampening(), popart::SGD::getUnsetVelocityScaling(), - popart::OptimizerValue(loss_scaling, true), clip_norm_settings); + popart::OptimizerValue(loss_scaling, true), + clip_norm_settings); }; resources_->eval_optimizer = std::make_unique( popart::OptimizerValue(0.0, false), popart::OptimizerValue(0.0, false), - popart::OptimizerValue(0.0, true), popart::SGD::getUnsetDampening(), + popart::OptimizerValue(0.0, true), + popart::SGD::getUnsetDampening(), popart::SGD::getUnsetVelocityScaling(), - popart::OptimizerValue(loss_scaling, true), clip_norm_settings); + popart::OptimizerValue(loss_scaling, true), + clip_norm_settings); } else if (type == "adam") { auto weight_decay = BOOST_GET_CONST(float, op_desc->GetAttr("weight_decay")); @@ -609,10 +621,15 @@ void Compiler::LowerOptimizer(const Scope* scope) { {"defaultEps", {eps, true}}, {"lossScaling", {loss_scaling, true}}, {"defaultMaxWeightNorm", {mwn, true}}}; - auto optimizer_instance = std::make_unique( - optimizer_value, adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings, scaled_optimizer_state_); + auto optimizer_instance = + std::make_unique(optimizer_value, + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + accl1_type, + accl2_type, + clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { optimizer_instance->insertSpecific( weight_decay_vars[i], @@ -629,9 +646,14 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(beta2, false), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), - popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, accl1_type, accl2_type, - clip_norm_settings, scaled_optimizer_state_); + popart::OptimizerValue(mwn, true), + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + accl1_type, + accl2_type, + clip_norm_settings, + scaled_optimizer_state_); } }; if (adam_mode == popart::AdamMode::Lamb) { @@ -642,11 +664,15 @@ void Compiler::LowerOptimizer(const Scope* scope) { {"defaultEps", {eps, true}}, {"lossScaling", {loss_scaling, true}}, 
{"defaultMaxWeightNorm", {mwn, true}}}; - auto eval_optimizer = std::make_unique( - optimizer_value, adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings, - scaled_optimizer_state_); + auto eval_optimizer = + std::make_unique(optimizer_value, + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, + clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { eval_optimizer->insertSpecific(weight_decay_vars[i], {{"weightDecay", {0.0, false}}}); @@ -660,11 +686,15 @@ void Compiler::LowerOptimizer(const Scope* scope) { {"defaultEps", {eps, true}}, {"lossScaling", {loss_scaling, true}}, {"defaultMaxWeightNorm", {mwn, true}}}; - auto eval_optimizer = std::make_unique( - optimizer_value, adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings, - scaled_optimizer_state_); + auto eval_optimizer = + std::make_unique(optimizer_value, + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, + clip_norm_settings, + scaled_optimizer_state_); for (int i = 0; i < weight_decay_vars.size(); i++) { eval_optimizer->insertSpecific(weight_decay_vars[i], {{"weightDecay", {0.0, false}}}); @@ -678,9 +708,13 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(beta2, false), popart::OptimizerValue(eps, true), popart::OptimizerValue(loss_scaling, true), - popart::OptimizerValue(mwn, true), adam_mode, weight_decay_mode, - popart::DataType::UNDEFINED, popart::DataType::FLOAT, - popart::DataType::FLOAT, clip_norm_settings, + popart::OptimizerValue(mwn, true), + adam_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, + clip_norm_settings, scaled_optimizer_state_); } } else if (type == "adaptive") { @@ -705,9 +739,13 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(alpha, true), popart::OptimizerValue(momentum, true), popart::OptimizerValue(eps, true), - popart::OptimizerValue(loss_scaling, true), adaptive_mode, - weight_decay_mode, popart::DataType::UNDEFINED, accl1_type, - accl2_type, accl3_type); + popart::OptimizerValue(loss_scaling, true), + adaptive_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + accl1_type, + accl2_type, + accl3_type); }; resources_->eval_optimizer = std::make_unique( popart::OptimizerValue(0.0, false), @@ -715,9 +753,12 @@ void Compiler::LowerOptimizer(const Scope* scope) { popart::OptimizerValue(alpha, true), popart::OptimizerValue(momentum, true), popart::OptimizerValue(eps, true), - popart::OptimizerValue(loss_scaling, true), adaptive_mode, - weight_decay_mode, popart::DataType::UNDEFINED, - popart::DataType::FLOAT, popart::DataType::FLOAT, + popart::OptimizerValue(loss_scaling, true), + adaptive_mode, + weight_decay_mode, + popart::DataType::UNDEFINED, + popart::DataType::FLOAT, + popart::DataType::FLOAT, popart::DataType::UNDEFINED); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -749,7 +790,8 @@ void Compiler::PostLower(const std::vector& tensor_ids, // Record output tensors auto pd_outs = GetOpOutputs(op_desc); PADDLE_ENFORCE_EQ( - pd_outs.size(), tensor_ids.size(), + pd_outs.size(), + tensor_ids.size(), platform::errors::Fatal("paddle and popart op have different outputs")); for (int i = 0; i < tensor_ids.size(); ++i) { 
resources_->tensors.emplace(pd_outs[i], tensor_ids[i]); @@ -763,13 +805,15 @@ void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc) { // Record output tensor auto pd_outs = GetOpOutputs(op_desc); PADDLE_ENFORCE_EQ( - pd_outs.size(), 1, + pd_outs.size(), + 1, platform::errors::Fatal("paddle and popart op have different outputs")); resources_->tensors.emplace(pd_outs[0], tensor_id); PostLower(tensor_id, op_desc, false); } -void Compiler::PostLower(const std::string& tensor_id, const OpDesc* op_desc, +void Compiler::PostLower(const std::string& tensor_id, + const OpDesc* op_desc, bool skip_pipline) { // Set pipline if (!skip_pipline && op_desc->HasAttr(sIpuIndexAttr)) { diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 43c6866296a61..279914a0b931b 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -76,8 +76,10 @@ class PdIArray final : public popart::IArray { std::size_t rank() const { return tensor_.dims().size(); } int64_t dim(size_t index) const { return tensor_.dims().at(index); } std::size_t nelms() const { - return std::accumulate(shape_.begin(), shape_.end(), - static_cast(1), std::multiplies()); + return std::accumulate(shape_.begin(), + shape_.end(), + static_cast(1), + std::multiplies()); } const popart::Shape shape() const { return shape_; } @@ -108,14 +110,23 @@ void Executor::Prepare(const std::string &proto) { VLOG(10) << "Creating TrainingSession from Onnx Model..."; auto optimizer = compiler_resources_->NewOptimizer(); session_ = popart::TrainingSession::createFromOnnxModel( - proto, dataFlow, compiler_resources_->loss_var, *optimizer, device_, - popart::InputShapeInfo(), ipu_strategy_->popart_options, + proto, + dataFlow, + compiler_resources_->loss_var, + *optimizer, + device_, + popart::InputShapeInfo(), + ipu_strategy_->popart_options, ipu_strategy_->popart_patterns); } else { VLOG(10) << "Creating InferenceSession from Onnx Model..."; session_ = popart::InferenceSession::createFromOnnxModel( - proto, dataFlow, device_, popart::InputShapeInfo(), - ipu_strategy_->popart_options, ipu_strategy_->popart_patterns); + proto, + dataFlow, + device_, + popart::InputShapeInfo(), + ipu_strategy_->popart_options, + ipu_strategy_->popart_patterns); } VLOG(10) << "Creating session from Onnx Model...done"; @@ -287,8 +298,9 @@ void Executor::AcquireDevice() { popart::DeviceManager::createDeviceManager().acquireAvailableDevice( RequestIpus(ipu_strategy_->num_ipus)); PADDLE_ENFORCE_NOT_NULL( - device_, errors::Unavailable("Can't attach IPU, ipu_num = %d.", - RequestIpus(ipu_strategy_->num_ipus))); + device_, + errors::Unavailable("Can't attach IPU, ipu_num = %d.", + RequestIpus(ipu_strategy_->num_ipus))); VLOG(10) << "Create IPU device...done"; } VLOG(10) << "leave Executor::AcquireDevice"; @@ -377,19 +389,23 @@ void Executor::ConvertWeights(bool align_to_popart) { auto num_elem = info.nelms(); if (align_to_popart) { std::vector fp16_data; - std::transform(data_ptr, data_ptr + num_elem, + std::transform(data_ptr, + data_ptr + num_elem, std::back_inserter(fp16_data), [&](float elem) { return popart::floatToHalf(elem); }); - memcpy(reinterpret_cast(data_ptr), fp16_data.data(), + memcpy(reinterpret_cast(data_ptr), + fp16_data.data(), num_elem * sizeof(float16)); } else { std::vector fp32_data; auto fp16_data_ptr = reinterpret_cast(data_ptr); - std::transform(fp16_data_ptr, fp16_data_ptr + num_elem, - std::back_inserter(fp32_data), 
[&](uint16_t elem) { - return popart::halfToFloat(elem); - }); - memcpy(reinterpret_cast(data_ptr), fp32_data.data(), + std::transform( + fp16_data_ptr, + fp16_data_ptr + num_elem, + std::back_inserter(fp32_data), + [&](uint16_t elem) { return popart::halfToFloat(elem); }); + memcpy(reinterpret_cast(data_ptr), + fp32_data.data(), num_elem * sizeof(float)); } } else { diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 708ea13177e7b..5acd075a6155f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -19,7 +19,8 @@ namespace { template void RegisterSetter( std::map>& options, // NOLINT - const std::string& name, Lambda setter) { + const std::string& name, + Lambda setter) { options[name] = setter; } @@ -27,7 +28,9 @@ template void RegisterGetter( std::map>& options, // NOLINT std::map& options_type, // NOLINT - const std::string& name, const std::string& type_str, Lambda getter) { + const std::string& name, + const std::string& type_str, + Lambda getter) { options[name] = getter; options_type[name] = type_str; } @@ -55,25 +58,28 @@ namespace ipu { IpuStrategy::IpuStrategy() { #define ADD_BOOL_OPTION(name) \ RegisterSetter(bool_options, #name, [&](bool value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "bool", \ - [&]() { return std::to_string(name); }) + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(name); \ + }) -#define ADD_UINT64_OPTION(name) \ - RegisterSetter(uint64_options, #name, \ - [&](std::uint64_t value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "uint64", \ - [&]() { return std::to_string(name); }) +#define ADD_UINT64_OPTION(name) \ + RegisterSetter( \ + uint64_options, #name, [&](std::uint64_t value) { name = value; }); \ + RegisterGetter(options_getter, options_type, #name, "uint64", [&]() { \ + return std::to_string(name); \ + }) #define ADD_DOUBLE_OPTION(name) \ RegisterSetter(double_options, #name, [&](double value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "double", \ - [&]() { return std::to_string(name); }) + RegisterGetter(options_getter, options_type, #name, "double", [&]() { \ + return std::to_string(name); \ + }) -#define ADD_STRING_OPTION(name) \ - RegisterSetter(string_options, #name, \ - [&](const std::string& value) { name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "string", \ - [&]() { return name; }) +#define ADD_STRING_OPTION(name) \ + RegisterSetter( \ + string_options, #name, [&](const std::string& value) { name = value; }); \ + RegisterGetter( \ + options_getter, options_type, #name, "string", [&]() { return name; }) ADD_BOOL_OPTION(is_training); ADD_BOOL_OPTION(need_avg_shard); @@ -103,11 +109,12 @@ IpuStrategy::IpuStrategy() { #undef ADD_UINT64_OPTION #undef ADD_BOOL_OPTION -#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \ - RegisterSetter(bool_options, #name, \ - [&](bool value) { runtime_options.aliased_name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ - return std::to_string(runtime_options.aliased_name); \ +#define ADD_RUNTIME_BOOL_OPTION(name, aliased_name) \ + RegisterSetter(bool_options, #name, [&](bool value) { \ + runtime_options.aliased_name = value; \ + }); \ + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(runtime_options.aliased_name); \ }) 
ADD_RUNTIME_BOOL_OPTION(runtime_options.enable_eval, enable_eval); @@ -117,7 +124,8 @@ IpuStrategy::IpuStrategy() { #define ADD_POPART_ENUM_OPTION_ALIAS(name, aliased_name, EnumType) \ RegisterSetter(uint64_options, #name, [&](std::uint64_t value) { \ PADDLE_ENFORCE_LT( \ - value, static_cast(popart::EnumType::N), \ + value, \ + static_cast(popart::EnumType::N), \ errors::InvalidArgument("Value for %s out of range", #EnumType)); \ popart_options.aliased_name = static_cast(value); \ }); \ @@ -126,11 +134,12 @@ IpuStrategy::IpuStrategy() { static_cast(popart_options.aliased_name)); \ }) -#define ADD_POPART_BOOL_OPTION_ALIAS(name, aliased_name) \ - RegisterSetter(bool_options, #name, \ - [&](bool value) { popart_options.aliased_name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ - return std::to_string(popart_options.aliased_name); \ +#define ADD_POPART_BOOL_OPTION_ALIAS(name, aliased_name) \ + RegisterSetter(bool_options, #name, [&](bool value) { \ + popart_options.aliased_name = value; \ + }); \ + RegisterGetter(options_getter, options_type, #name, "bool", [&]() { \ + return std::to_string(popart_options.aliased_name); \ }) #define ADD_POPART_UINT64_OPTION_ALIAS(name, aliased_name) \ @@ -141,19 +150,21 @@ IpuStrategy::IpuStrategy() { return std::to_string(popart_options.aliased_name); \ }) -#define ADD_POPART_DOUBLE_OPTION_ALIAS(name, aliased_name) \ - RegisterSetter(double_options, #name, \ - [&](double value) { popart_options.aliased_name = value; }); \ - RegisterGetter(options_getter, options_type, #name, "double", [&]() { \ - return std::to_string(popart_options.aliased_name); \ +#define ADD_POPART_DOUBLE_OPTION_ALIAS(name, aliased_name) \ + RegisterSetter(double_options, #name, [&](double value) { \ + popart_options.aliased_name = value; \ + }); \ + RegisterGetter(options_getter, options_type, #name, "double", [&]() { \ + return std::to_string(popart_options.aliased_name); \ }) #define ADD_POPART_STRING_OPTION_ALIAS(name, aliased_name) \ RegisterSetter(string_options, #name, [&](const std::string& value) { \ popart_options.aliased_name = value; \ }); \ - RegisterGetter(options_getter, options_type, #name, "string", \ - [&]() { return popart_options.aliased_name; }) + RegisterGetter(options_getter, options_type, #name, "string", [&]() { \ + return popart_options.aliased_name; \ + }) ADD_POPART_ENUM_OPTION_ALIAS(autodiff_settings.stitch_strategy, autodiffSettings.stitchStrategy, @@ -167,14 +178,14 @@ IpuStrategy::IpuStrategy() { ADD_POPART_ENUM_OPTION_ALIAS(batch_serialization_settings.batch_schedule, batchSerializationSettings.batchSchedule, BatchSerializationBatchSchedule); - ADD_POPART_ENUM_OPTION_ALIAS(auto_recomputation, autoRecomputation, - RecomputationType); - ADD_POPART_ENUM_OPTION_ALIAS(merge_var_update, mergeVarUpdate, - MergeVarUpdateType); - ADD_POPART_ENUM_OPTION_ALIAS(virtual_graph_mode, virtualGraphMode, - VirtualGraphMode); - ADD_POPART_ENUM_OPTION_ALIAS(synthetic_data_mode, syntheticDataMode, - SyntheticDataMode); + ADD_POPART_ENUM_OPTION_ALIAS( + auto_recomputation, autoRecomputation, RecomputationType); + ADD_POPART_ENUM_OPTION_ALIAS( + merge_var_update, mergeVarUpdate, MergeVarUpdateType); + ADD_POPART_ENUM_OPTION_ALIAS( + virtual_graph_mode, virtualGraphMode, VirtualGraphMode); + ADD_POPART_ENUM_OPTION_ALIAS( + synthetic_data_mode, syntheticDataMode, SyntheticDataMode); ADD_POPART_ENUM_OPTION_ALIAS(subgraph_copying_strategy, subgraphCopyingStrategy, SubgraphCopyingStrategy); @@ -183,7 +194,8 @@ IpuStrategy::IpuStrategy() { 
ReductionType); ADD_POPART_ENUM_OPTION_ALIAS( mean_accumulation_and_replication_reduction_strategy, - meanAccumulationAndReplicationReductionStrategy, MeanReductionStrategy); + meanAccumulationAndReplicationReductionStrategy, + MeanReductionStrategy); ADD_POPART_STRING_OPTION_ALIAS(log_dir, logDir); ADD_POPART_STRING_OPTION_ALIAS(cache_path, cachePath); @@ -309,14 +321,14 @@ IpuStrategy::IpuStrategy() { #undef ADD_POPART_BOOL_OPTION_ALIAS #undef ADD_POPART_ENUM_OPTION_ALIAS - RegisterGetter(vector_options_getter, options_type, "custom_ops", "vector", - [&]() { - std::vector res; - for (auto x : custom_ops) { - res.push_back(x.repr()); - } - return res; - }); + RegisterGetter( + vector_options_getter, options_type, "custom_ops", "vector", [&]() { + std::vector res; + for (auto x : custom_ops) { + res.push_back(x.repr()); + } + return res; + }); RegisterSetter(bool_options, "enable_manual_shard", [&](bool value) { if (value) { @@ -326,11 +338,11 @@ IpuStrategy::IpuStrategy() { } }); - RegisterGetter(options_getter, options_type, "enable_manual_shard", "bool", - [&]() { - return std::to_string(popart_options.virtualGraphMode == - popart::VirtualGraphMode::Manual); - }); + RegisterGetter( + options_getter, options_type, "enable_manual_shard", "bool", [&]() { + return std::to_string(popart_options.virtualGraphMode == + popart::VirtualGraphMode::Manual); + }); RegisterSetter(bool_options, "enable_half_partial", [&](bool value) { if (value) { @@ -347,10 +359,11 @@ IpuStrategy::IpuStrategy() { return std::to_string(popart_options.partialsTypeMatMuls == "half"); }); - RegisterSetter(container_options, "dot_checks", + RegisterSetter(container_options, + "dot_checks", [&](const std::pair& p) { - std::vector valid_dot{"Fwd0", "Fwd1", "Bwd0", - "PreAlias", "Final"}; + std::vector valid_dot{ + "Fwd0", "Fwd1", "Bwd0", "PreAlias", "Final"}; if (std::find(valid_dot.begin(), valid_dot.end(), p.first) == valid_dot.end()) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -359,16 +372,17 @@ IpuStrategy::IpuStrategy() { popart_options.dotChecks.insert(p.first); }); - RegisterGetter(vector_options_getter, options_type, "dot_checks", "vector", - [&]() { - std::vector res; - for (auto x : popart_options.dotChecks) { - res.push_back(x); - } - return res; - }); + RegisterGetter( + vector_options_getter, options_type, "dot_checks", "vector", [&]() { + std::vector res; + for (auto x : popart_options.dotChecks) { + res.push_back(x); + } + return res; + }); - RegisterSetter(container_options, "hardware_instrumentations", + RegisterSetter(container_options, + "hardware_instrumentations", [&](const std::pair& p) { std::uint64_t value = std::stoul(p.first); popart_options.hardwareInstrumentations.insert( @@ -376,8 +390,11 @@ IpuStrategy::IpuStrategy() { }); RegisterGetter( - vector_options_getter, options_type, "hardware_instrumentations", - "vector", [&]() { + vector_options_getter, + options_type, + "hardware_instrumentations", + "vector", + [&]() { std::vector res; for (auto x : popart_options.hardwareInstrumentations) { res.push_back(std::to_string(static_cast(x))); @@ -385,59 +402,74 @@ IpuStrategy::IpuStrategy() { return res; }); - RegisterSetter(container_options, "custom_codelets", + RegisterSetter(container_options, + "custom_codelets", [&](const std::pair& p) { popart_options.customCodelets.push_back(p.first); }); - RegisterGetter(vector_options_getter, options_type, "custom_codelets", - "vector", [&]() { - std::vector res; - for (auto x : popart_options.customCodelets) { - res.push_back(x); - } - return res; 
- }); + RegisterGetter( + vector_options_getter, options_type, "custom_codelets", "vector", [&]() { + std::vector res; + for (auto x : popart_options.customCodelets) { + res.push_back(x); + } + return res; + }); - RegisterSetter(container_options, "engine_options", + RegisterSetter(container_options, + "engine_options", [&](const std::pair& p) { popart_options.engineOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "engine_options", "map", - [&]() { return popart_options.engineOptions; }); + RegisterGetter( + map_options_getter, options_type, "engine_options", "map", [&]() { + return popart_options.engineOptions; + }); - RegisterSetter(container_options, "report_options", + RegisterSetter(container_options, + "report_options", [&](const std::pair& p) { popart_options.reportOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "report_options", "map", - [&]() { return popart_options.reportOptions; }); + RegisterGetter( + map_options_getter, options_type, "report_options", "map", [&]() { + return popart_options.reportOptions; + }); - RegisterSetter(container_options, "convolution_options", + RegisterSetter(container_options, + "convolution_options", [&](const std::pair& p) { popart_options.convolutionOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "convolution_options", "map", - [&]() { return popart_options.convolutionOptions; }); + RegisterGetter( + map_options_getter, options_type, "convolution_options", "map", [&]() { + return popart_options.convolutionOptions; + }); - RegisterSetter(container_options, "lstm_options", + RegisterSetter(container_options, + "lstm_options", [&](const std::pair& p) { popart_options.lstmOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "lstm_options", "map", - [&]() { return popart_options.lstmOptions; }); + RegisterGetter( + map_options_getter, options_type, "lstm_options", "map", [&]() { + return popart_options.lstmOptions; + }); - RegisterSetter(container_options, "gcl_options", + RegisterSetter(container_options, + "gcl_options", [&](const std::pair& p) { popart_options.gclOptions.emplace(p); }); - RegisterGetter(map_options_getter, options_type, "gcl_options", "map", - [&]() { return popart_options.gclOptions; }); + RegisterGetter(map_options_getter, options_type, "gcl_options", "map", [&]() { + return popart_options.gclOptions; + }); // Default options @@ -465,15 +497,19 @@ void IpuStrategy::AddStringOption(const std::string& option, void IpuStrategy::InsertStringOption(const std::string& option, const std::string& value) { - set(option, std::pair(value, ""), container_options, + set(option, + std::pair(value, ""), + container_options, "vector"); } void IpuStrategy::InsertStringPairOption(const std::string& option, const std::string& key, const std::string& value) { - set(option, std::pair(key, value), - container_options, "map"); + set(option, + std::pair(key, value), + container_options, + "map"); } void IpuStrategy::SetTensorLocation(const std::string& tensor, @@ -553,7 +589,8 @@ void IpuStrategy::SetAccumulateOuterFragmentSettings( void IpuStrategy::AddCustomOp(const std::string& paddle_op, const std::string& popart_op, - const std::string& domain, int version) { + const std::string& domain, + int version) { LOG(INFO) << "IpuStrategy add custom op: " << paddle_op; custom_ops.push_back( IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h 
b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 2b73b70b56ddf..7207e9199765f 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -121,15 +121,19 @@ class IpuStrategy { void AddDoubleOption(const std::string &option, double value); void AddStringOption(const std::string &option, const std::string &value); void InsertStringOption(const std::string &option, const std::string &value); - void InsertStringPairOption(const std::string &option, const std::string &key, + void InsertStringPairOption(const std::string &option, + const std::string &key, const std::string &value); - void SetTensorLocation(const std::string &tensor, const std::string &option, + void SetTensorLocation(const std::string &tensor, + const std::string &option, std::uint64_t value); void SetReplicatedCollectivesSettings(const std::string &opt, bool value); void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, const std::vector &values); - void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, - const std::string &domain, int version); + void AddCustomOp(const std::string &paddle_op, + const std::string &popart_op, + const std::string &domain, + int version); void SetCompilationProgressLogger( const std::function &logger); @@ -146,15 +150,18 @@ class IpuStrategy { private: template void set( - const std::string &key, ValueType value, + const std::string &key, + ValueType value, std::map> &options, // NOLINT const std::string &type_str) { auto it = options.find(key); PADDLE_ENFORCE_NE( - it, options.end(), + it, + options.end(), platform::errors::InvalidArgument("Cannot find option: %s, type: %s " "when setting IpuStrategy options", - key, type_str)); + key, + type_str)); it->second(value); } @@ -164,7 +171,8 @@ class IpuStrategy { std::map> &options) { // NOLINT auto it = options.find(key); PADDLE_ENFORCE_NE( - it, options.end(), + it, + options.end(), platform::errors::InvalidArgument( "Cannot find option name: %s when trying to get IpuStrategy option", key)); diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc index 997fd9be070cb..1e9291cf57256 100644 --- a/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/other_ops.cc @@ -25,8 +25,8 @@ Node *custom_op_handler(Graph *graph, Node *node) { auto *op = node->Op(); auto attrs = op->GetAttrMap(); attrs.insert({"__op_type", node->Op()->Type()}); - auto new_node = CreateBaseOp(graph, node, "popart_custom_op", node->inputs, - node->outputs, attrs); + auto new_node = CreateBaseOp( + graph, node, "popart_custom_op", node->inputs, node->outputs, attrs); return new_node; } @@ -43,15 +43,15 @@ Node *print_handler(Graph *graph, Node *node) { } auto attrs = AttributeMap{{"print_gradient", print_gradient}, {"title", title}}; - return CreateBaseOp(graph, node, "popart_printtensor", node->inputs, - node->outputs, attrs); + return CreateBaseOp( + graph, node, "popart_printtensor", node->inputs, node->outputs, attrs); } Node *popart_optimizer_handler(Graph *graph, Node *node) { return nullptr; } Node *checkpointoutput_handler(Graph *graph, Node *node) { - return CreateBaseOp(graph, node, "popart_checkpointoutput", node->inputs, - node->outputs); + return CreateBaseOp( + graph, node, "popart_checkpointoutput", node->inputs, node->outputs); } Node *custom_nll_loss_handler(Graph *graph, Node *node) { @@ 
-61,12 +61,18 @@ Node *custom_nll_loss_handler(Graph *graph, Node *node) {
   auto inputIsLogProbability = BOOST_GET_CONST(bool, op->GetAttr("inputIsLogProbability"));
   if (ignoreIndex == "None") {
-    return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs,
+    return CreateBaseOp(graph,
+                        node,
+                        "popart_nllloss_v2",
+                        node->inputs,
                         node->outputs,
                         {{"reduction", reduction},
                          {"inputIsLogProbability", inputIsLogProbability}});
   } else {
-    return CreateBaseOp(graph, node, "popart_nllloss_v2", node->inputs,
+    return CreateBaseOp(graph,
+                        node,
+                        "popart_nllloss_v2",
+                        node->inputs,
                         node->outputs,
                         {{"reduction", reduction},
                          {"ignoreIndex", std::atoi(ignoreIndex.c_str())},
@@ -75,20 +81,24 @@ Node *custom_nll_loss_handler(Graph *graph, Node *node) {
 }
 
 Node *identity_handler(Graph *graph, Node *node) {
-  return CreateBaseOp(graph, node, "popart_identity", node->inputs,
-                      node->outputs);
+  return CreateBaseOp(
+      graph, node, "popart_identity", node->inputs, node->outputs);
 }
 
 Node *identity_loss_handler(Graph *graph, Node *node) {
   auto *op = node->Op();
   auto reduction = BOOST_GET_CONST(int, op->GetAttr("reduction"));
-  return CreateBaseOp(graph, node, "popart_identity_loss", node->inputs,
-                      node->outputs, {{"reduction", reduction}});
+  return CreateBaseOp(graph,
+                      node,
+                      "popart_identity_loss",
+                      node->inputs,
+                      node->outputs,
+                      {{"reduction", reduction}});
 }
 
 Node *detach_handler(Graph *graph, Node *node) {
-  return CreateBaseOp(graph, node, "popart_detach_v2", node->inputs,
-                      node->outputs);
+  return CreateBaseOp(
+      graph, node, "popart_detach_v2", node->inputs, node->outputs);
 }
 
 } // namespace
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index f742b0716e284..6f6950f9cd1c4 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -56,7 +56,8 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
   auto& desired_dev_ctx = static_cast(dev_ctx);
   if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
-    return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size,
+    return paddle::memory::Alloc(desired_dev_ctx.GetPlace(),
+                                 size,
                                  phi::Stream(reinterpret_cast(
                                      desired_dev_ctx.stream())));
   } else {
@@ -226,11 +227,15 @@ template
 inline void EmplaceDeviceContext(
     std::map>>* place_to_device_context,
-    platform::Place place, bool disable_setting_default_stream_for_allocator) {
+    platform::Place place,
+    bool disable_setting_default_stream_for_allocator) {
   // lazy evaluation. i.e., only create device context at first `Get`
   place_to_device_context->emplace(
-      place, std::async(std::launch::deferred, CreateDeviceContext,
-                        place, disable_setting_default_stream_for_allocator));
+      place,
+      std::async(std::launch::deferred,
+                 CreateDeviceContext,
+                 place,
+                 disable_setting_default_stream_for_allocator));
 }
 
 void EmplaceDeviceContexts(
@@ -239,7 +244,8 @@ void EmplaceDeviceContexts(
     const std::vector& places,
     bool disable_setting_default_stream_for_allocator) {
   PADDLE_ENFORCE_GT(
-      places.size(), 0,
+      places.size(),
+      0,
       platform::errors::InvalidArgument("The number of platform places should "
                                         "be larger than 0.
But received %d.", places.size())); @@ -253,17 +259,20 @@ void EmplaceDeviceContexts( if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #endif } else if (platform::is_gpu_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -273,7 +282,8 @@ void EmplaceDeviceContexts( } else if (platform::is_cuda_pinned_place(p)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -283,7 +293,8 @@ void EmplaceDeviceContexts( } else if (platform::is_xpu_place(p)) { #ifdef PADDLE_WITH_XPU EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -293,7 +304,8 @@ void EmplaceDeviceContexts( } else if (platform::is_mlu_place(p)) { #ifdef PADDLE_WITH_MLU EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -303,7 +315,8 @@ void EmplaceDeviceContexts( } else if (platform::is_ipu_place(p)) { #ifdef PADDLE_WITH_IPU EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW( @@ -313,7 +326,8 @@ void EmplaceDeviceContexts( } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -323,7 +337,8 @@ void EmplaceDeviceContexts( } else if (platform::is_npu_pinned_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -334,7 +349,8 @@ void EmplaceDeviceContexts( } else if (platform::is_custom_place(p)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE EmplaceDeviceContext( - place_to_device_context, p, + place_to_device_context, + p, disable_setting_default_stream_for_allocator); #else PADDLE_THROW(platform::errors::Unimplemented( @@ -348,7 +364,8 @@ void EmplaceDeviceContexts( DeviceContextPool::DeviceContextPool( const std::vector& places) { - EmplaceDeviceContexts(&device_contexts_, places, + EmplaceDeviceContexts(&device_contexts_, + places, /*disable_setting_default_stream_for_allocator=*/false); } @@ -403,8 +420,8 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - platform::RecordEvent record_event("NPUDeviceContext/wait", - platform::TracerEventType::UserDefined, 2); + platform::RecordEvent record_event( + "NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2); VLOG(4) << "NPU context(" << this << ") Wait"; stream_->Wait(); } @@ -847,7 +864,8 @@ void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { VLOG(3) << "Prevented Clearing DNNL cache. 
Updated " "block_next_cache_clearing_ : " << block_next_cache_clearing_; - PADDLE_ENFORCE_GE(block_next_cache_clearing_, 0, + PADDLE_ENFORCE_GE(block_next_cache_clearing_, + 0, platform::errors::InvalidArgument( "Cache clearing mark should be non-negative " ". But received %d.", diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index ce8b825b13fe8..569890fa25cd6 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -101,8 +101,8 @@ class PyVariableWrapperHook : public imperative::VariableWrapperHook { // 2. call hook and return PyObject *res = nullptr; try { - res = PyObject_CallFunctionObjArgs(py_func_, py::cast(tmp_varbase).ptr(), - nullptr); + res = PyObject_CallFunctionObjArgs( + py_func_, py::cast(tmp_varbase).ptr(), nullptr); } catch (platform::EnforceNotMet &e) { throw std::move(e); } catch (std::exception &e) { @@ -159,8 +159,10 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { } // only initialize varbase, but not its tensor. -static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, - bool persistable = false, int stop_gradient = -1) { +static void InitVarBaseOnly(imperative::VarBase *self, + const std::string &name, + bool persistable = false, + int stop_gradient = -1) { auto name_ = name == "" ? imperative::GetCurrentTracer()->GenerateUniqueName( "generated_tensor") : name; @@ -177,10 +179,13 @@ static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, } // initialize varbase and its tensor. -static void InitVarBaseAndTensor( - imperative::VarBase *self, const py::array &array, - const platform::Place &place, const std::string &name, - bool persistable = false, bool zero_copy = false, int stop_gradient = -1) { +static void InitVarBaseAndTensor(imperative::VarBase *self, + const py::array &array, + const platform::Place &place, + const std::string &name, + bool persistable = false, + bool zero_copy = false, + int stop_gradient = -1) { InitVarBaseOnly(self, name, persistable, stop_gradient); auto *tensor = self->MutableVar()->GetMutable(); VLOG(4) << "zero_copy: " << zero_copy; @@ -191,8 +196,8 @@ static void InitVarBaseAndTensor( } else if (platform::is_gpu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_cuda_pinned_place(place)) { - SetTensorFromPyArray(tensor, array, place, - zero_copy); + SetTensorFromPyArray( + tensor, array, place, zero_copy); } else if (platform::is_npu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_ipu_place(place)) { @@ -200,8 +205,8 @@ static void InitVarBaseAndTensor( } else if (platform::is_mlu_place(place)) { SetTensorFromPyArray(tensor, array, place, zero_copy); } else if (platform::is_custom_place(place)) { - SetTensorFromPyArray(tensor, array, place, - zero_copy); + SetTensorFromPyArray( + tensor, array, place, zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " @@ -231,8 +236,8 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, // ignored auto place = kwargs.contains("place") ? 
PyObjectToPlace(kwargs["place"]) : default_place; - InitVarBaseAndTensor(self, array, place, name, persistable, zero_copy, - stop_gradient); + InitVarBaseAndTensor( + self, array, place, name, persistable, zero_copy, stop_gradient); } else { InitVarBaseOnly(self, name, persistable, stop_gradient); } @@ -240,7 +245,8 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, template static void InitVarBaseFromNumpyWithArg(imperative::VarBase *self, - const py::array &array, const P &place, + const py::array &array, + const P &place, bool persistable = false, bool zero_copy = false, std::string name = "", @@ -400,7 +406,8 @@ static imperative::NameVarBaseMap ConvertToNameVarBaseMap( } PADDLE_ENFORCE_EQ( - PyErr_Occurred(), nullptr, + PyErr_Occurred(), + nullptr, platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); return result; } @@ -422,7 +429,8 @@ paddle::imperative::NameTensorMap ConvertToNameTensorMap( } PADDLE_ENFORCE_EQ( - PyErr_Occurred(), nullptr, + PyErr_Occurred(), + nullptr, platform::errors::InvalidArgument(py::str(py::handle(PyErr_Occurred())))); return result; } @@ -430,7 +438,8 @@ paddle::imperative::NameTensorMap ConvertToNameTensorMap( template static void VarBaseCopy(std::shared_ptr &src, // NOLINT imperative::VarBase &dst, // NOLINT - const P &dst_device, const bool blocking) { + const P &dst_device, + const bool blocking) { if (dst.SharedVar()->IsEmpty()) { VLOG(3) << "deep copy Variable from " << src->Name() << " to " << dst.Name(); @@ -457,7 +466,8 @@ static void VarBaseCopy(std::shared_ptr &src, // NOLINT dst.MutableVar()->GetMutable(); dst_selected_rows->set_height(src_selected_rows.height()); dst_selected_rows->set_rows(src_selected_rows.rows()); - framework::TensorCopy(src_selected_rows.value(), dst_device, + framework::TensorCopy(src_selected_rows.value(), + dst_device, dst_selected_rows->mutable_value()); if (blocking) { platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); @@ -493,7 +503,8 @@ void BindImperative(py::module *m_ptr) { // Dygraph DataLoader signal handler m.def("_set_process_pids", [](int64_t key, py::object &obj) { PADDLE_ENFORCE_EQ( - py::isinstance(obj) || py::isinstance(obj), true, + py::isinstance(obj) || py::isinstance(obj), + true, platform::errors::InvalidArgument( "The subprocess ids set in DataLoader is illegal." "Expected data type is tuple or list, but received %s", @@ -528,7 +539,8 @@ void BindImperative(py::module *m_ptr) { // 1. cast to python array auto array = batch[i].cast(); PADDLE_ENFORCE_NE( - string::Sprintf("%s", array.dtype()).compare("object"), 0, + string::Sprintf("%s", array.dtype()).compare("object"), + 0, platform::errors::InvalidArgument( "Faild to convert input data to a regular ndarray.\n * " "Usually this means the input data contains nested " @@ -537,8 +549,8 @@ void BindImperative(py::module *m_ptr) { "_generator' to locate the data causes this issue.")); // 2. construcct LoDTensor framework::LoDTensor t; - SetTensorFromPyArray(&t, array, - platform::CPUPlace(), true); + SetTensorFromPyArray( + &t, array, platform::CPUPlace(), true); // 3. allocate shared memory void *data_ptr = t.data(); size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); @@ -548,8 +560,11 @@ void BindImperative(py::module *m_ptr) { const std::string &ipc_name = shared_writer_holder->ipc_name(); memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); // 5. 
copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); + memory::Copy(platform::CPUPlace(), + shared_writer_holder->ptr(), + platform::CPUPlace(), + data_ptr, + data_size); t.ResetHolder(shared_writer_holder); // 6. append to result list tensors.append(t); @@ -564,7 +579,8 @@ void BindImperative(py::module *m_ptr) { // 1. cast to python array auto array = obj.cast(); PADDLE_ENFORCE_NE( - string::Sprintf("%s", array.dtype()).compare("object"), 0, + string::Sprintf("%s", array.dtype()).compare("object"), + 0, platform::errors::InvalidArgument( "Faild to convert input data to a regular ndarray.\n * " "Usually this means the input data contains nested " @@ -573,8 +589,8 @@ void BindImperative(py::module *m_ptr) { "_generator' to locate the data causes this issue.")); // 2. construcct LoDTensor framework::LoDTensor t; - SetTensorFromPyArray(&t, array, - platform::CPUPlace(), true); + SetTensorFromPyArray( + &t, array, platform::CPUPlace(), true); // 3. allocate shared memory void *data_ptr = t.data(); size_t data_size = t.numel() * framework::DataTypeSize(t.dtype()); @@ -584,8 +600,11 @@ void BindImperative(py::module *m_ptr) { const std::string &ipc_name = shared_writer_holder->ipc_name(); memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); // 5. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); + memory::Copy(platform::CPUPlace(), + shared_writer_holder->ptr(), + platform::CPUPlace(), + data_ptr, + data_size); t.ResetHolder(shared_writer_holder); return t; @@ -640,9 +659,12 @@ void BindImperative(py::module *m_ptr) { new (&self) imperative::VarBase(name); }) .def("__init__", - [](imperative::VarBase &self, framework::proto::VarType::Type dtype, - const std::vector &dims, const py::handle &name, - framework::proto::VarType::Type type, bool persistable) { + [](imperative::VarBase &self, + framework::proto::VarType::Type dtype, + const std::vector &dims, + const py::handle &name, + framework::proto::VarType::Type type, + bool persistable) { VLOG(4) << "Init VarBase"; std::string act_name = ""; if (!name.ptr() || name.ptr() == Py_None) { @@ -661,55 +683,107 @@ void BindImperative(py::module *m_ptr) { tensor->Resize(phi::make_ddim(dims)); } }) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", 
&InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) - .def("__init__", &InitVarBaseFromNumpyWithArg, - py::arg("value"), py::arg("place"), py::arg("persistable") = false, - py::arg("zero_copy") = false, py::arg("name") = "", + .def("__init__", + &InitVarBaseFromNumpyWithArg, + py::arg("value"), + py::arg("place"), + py::arg("persistable") = false, + py::arg("zero_copy") = false, + py::arg("name") = "", py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) - .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"), + .def("__init__", + &InitVarBaseFromTensorWithArgDefault, + py::arg("tensor"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), + py::arg("name") = "") + .def("__init__", + &InitVarBaseFromTensorWithArg, + py::arg("tensor"), + py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") - .def("__init__", &InitVarBaseFromTensorWithArg, - py::arg("tensor"), py::arg("place"), py::arg("name") = "") .def("__init__", &InitVarBaseFromNumpyWithKwargs) .def( "__setitem_varbase__", - [](std::shared_ptr &self, py::handle _index, + [](std::shared_ptr &self, + py::handle _index, py::object 
&value_obj) { VLOG(4) << "Call __setitem_varbase__"; @@ -767,9 +841,16 @@ void BindImperative(py::module *m_ptr) { none_axes, infer_flags, list_select_idxs; // if index is a list, list_select_flag will be true bool list_select_flag = false; - ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, - &steps, &decrease_axes, &none_axes, - &infer_flags, &list_select_idxs, + ParseIndexingSlice(self_tensor, + index_ptr, + &axes, + &starts, + &ends, + &steps, + &decrease_axes, + &none_axes, + &infer_flags, + &list_select_idxs, &list_select_flag); framework::AttributeMap attrs = {{"axes", axes}, @@ -786,7 +867,8 @@ void BindImperative(py::module *m_ptr) { if (tracer->HasGrad()) { PADDLE_ENFORCE_EQ( - self->IsLeaf() && !self->OverridedStopGradient(), false, + self->IsLeaf() && !self->OverridedStopGradient(), + false, platform::errors::InvalidArgument( "Leaf Tensor (%s) that doesn't stop gradient can't use " "inplace strategy.", @@ -844,7 +926,9 @@ void BindImperative(py::module *m_ptr) { SetTensorFromPyArray(value_tensor->MutableVar() ->GetMutable(), - value, self->Place(), false); + value, + self->Place(), + false); ins.insert({"ValueTensor", {value_tensor}}); } else { @@ -892,7 +976,10 @@ void BindImperative(py::module *m_ptr) { { // Release gil and do tracing py::gil_scoped_release release; - tracer->TraceOp("set_value", ins, outs, std::move(attrs), + tracer->TraceOp("set_value", + ins, + outs, + std::move(attrs), {{"Input", "Out"}}); } } else { @@ -910,8 +997,8 @@ void BindImperative(py::module *m_ptr) { VLOG(4) << "index is not tensor"; self_numpy[_index] = value_obj; } - SetTensorFromPyArray(self_tensor, self_numpy, - self_tensor->place(), false); + SetTensorFromPyArray( + self_tensor, self_numpy, self_tensor->place(), false); } }) .def("_getitem_index_not_tensor", @@ -924,10 +1011,17 @@ void BindImperative(py::module *m_ptr) { bool list_select_flag = false; auto tensor = self->MutableVar()->GetMutable(); - ParseIndexingSlice(tensor, _index.ptr(), &slice_axes, - &slice_starts, &slice_ends, &slice_strides, - &decrease_axis, &none_axes, &infer_flags, - &list_select_idxs, &list_select_flag); + ParseIndexingSlice(tensor, + _index.ptr(), + &slice_axes, + &slice_starts, + &slice_ends, + &slice_strides, + &decrease_axis, + &none_axes, + &infer_flags, + &list_select_idxs, + &list_select_flag); // release gil and do tracing py::gil_scoped_release release; const auto &tracer = imperative::GetCurrentTracer(); @@ -1008,8 +1102,8 @@ void BindImperative(py::module *m_ptr) { ->GetMutable(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get( tracer->ExpectedPlace()); - paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, - idx_tensor); + paddle::framework::TensorFromVector( + list_select_idxs, *dev_ctx, idx_tensor); imperative::NameVarBaseMap ins = {{"X", {self}}, {"Index", {select_index}}}; @@ -1024,7 +1118,8 @@ void BindImperative(py::module *m_ptr) { [](std::shared_ptr &self, const py::args &args) { const auto &tensor = self->Var().Get(); PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, + tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor of %s is Empty, please check if it has no data.", self->Name())); @@ -1043,28 +1138,34 @@ void BindImperative(py::module *m_ptr) { size_t offset = 0; if (args.empty()) { PADDLE_ENFORCE_EQ( - numel, 1, + numel, + 1, platform::errors::InvalidArgument( "only one element tensors can be converted to Python " "scalars when no input coordinates")); } else if (args.size() == 1) { offset = args[0].cast(); 
PADDLE_ENFORCE_LT( - offset, numel, + offset, + numel, platform::errors::InvalidArgument( "index %d is out of bounds for size %d", offset, numel)); } else { - PADDLE_ENFORCE_EQ(args.size(), dims.size(), + PADDLE_ENFORCE_EQ(args.size(), + dims.size(), platform::errors::InvalidArgument( "incorrect number of indices for Tensor")); for (size_t i = 0; i < args.size(); ++i) { size_t index = args[i].cast(); PADDLE_ENFORCE_LT( - index, dims[i], + index, + dims[i], platform::errors::InvalidArgument( "index %d is out fo bounds for axis %d with size %d", - index, i, dims[i])); + index, + i, + dims[i])); offset += index * strides[i]; } } @@ -1072,8 +1173,8 @@ void BindImperative(py::module *m_ptr) { if (framework::TransToProtoVarType(tensor.dtype()) == proto_type) { \ std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(proto_type); \ T b = TensorGetElement(tensor, offset); \ - return py::array(py::dtype(py_dtype_str.c_str()), {}, {}, \ - static_cast(&b)); \ + return py::array( \ + py::dtype(py_dtype_str.c_str()), {}, {}, static_cast(&b)); \ } _ForEachDataType_(TENSOR_TO_PY_SCALAR); @@ -1086,7 +1187,8 @@ void BindImperative(py::module *m_ptr) { [](imperative::VarBase &self) -> uint32_t { const auto &var = self.MutableVar(); PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, + var->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor of %s is Empty, please check if it has no data.", self.Name())); @@ -1111,7 +1213,8 @@ void BindImperative(py::module *m_ptr) { [](imperative::VarBase &self) -> py::array { const auto &tensor = self.MutableVar()->Get(); PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, + tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor of %s is Empty, please check if it has no data.", self.Name())); @@ -1142,7 +1245,8 @@ void BindImperative(py::module *m_ptr) { [](const imperative::VarBase &self) -> std::shared_ptr { PADDLE_ENFORCE_EQ( - self.Var().IsInitialized(), true, + self.Var().IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); @@ -1165,7 +1269,8 @@ void BindImperative(py::module *m_ptr) { const auto &origin_tensor = self.Var().Get(); PADDLE_ENFORCE_EQ( - origin_tensor.IsInitialized(), true, + origin_tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); @@ -1181,7 +1286,8 @@ void BindImperative(py::module *m_ptr) { const auto &origin_selected_rows = self.Var().Get(); PADDLE_ENFORCE_EQ( - origin_selected_rows.value().IsInitialized(), true, + origin_selected_rows.value().IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self.Name())); @@ -1199,7 +1305,8 @@ void BindImperative(py::module *m_ptr) { << ") share data with " << self.Name(); return detach_var; }, - py::return_value_policy::take_ownership, R"DOC( + py::return_value_policy::take_ownership, + R"DOC( Returns a new Tensor, detached from the current graph. It will share data with origin Tensor and always doesn't have a Tensor copy. @@ -1237,8 +1344,10 @@ void BindImperative(py::module *m_ptr) { # one of the variables needed for gradient computation has been modified by an inplace operation. 
)DOC") - .def("clear_gradient", &imperative::VarBase::ClearGradient, - py::arg("set_to_zero") = true, R"DOC( + .def("clear_gradient", + &imperative::VarBase::ClearGradient, + py::arg("set_to_zero") = true, + R"DOC( Only for Tensor that has gradient, normally we use this for Parameters since other temporary Tensor doesen't has gradient. @@ -1258,14 +1367,16 @@ void BindImperative(py::module *m_ptr) { linear.weight.clear_gradient() print("After clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) )DOC") - .def("_gradient_set_empty", &imperative::VarBase::_GradientSetEmpty, + .def("_gradient_set_empty", + &imperative::VarBase::_GradientSetEmpty, py::arg("set_is_empty") = true) .def("_is_gradient_set_empty", &imperative::VarBase::_IsGradientSetEmpty) .def( "clone", [](std::shared_ptr &self) { const auto &tensor = self->Var().Get(); - PADDLE_ENFORCE_EQ(tensor.IsInitialized(), true, + PADDLE_ENFORCE_EQ(tensor.IsInitialized(), + true, platform::errors::InvalidArgument( "%s has not been initialized", self->Name())); auto tracer = imperative::GetCurrentTracer(); @@ -1277,7 +1388,8 @@ void BindImperative(py::module *m_ptr) { tracer->TraceOp("assign", ins, outs, attrs); return new_var; }, - py::return_value_policy::copy, R"DOC( + py::return_value_policy::copy, + R"DOC( Returns a new Tensor, which is clone of origin Tensor, and it remains in the current graph. It will always have a Tensor copy. @@ -1402,7 +1514,8 @@ void BindImperative(py::module *m_ptr) { .def("_register_grad_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot register gradient hook on a Tensor that stop " "gradient or without gradient.")); @@ -1412,7 +1525,8 @@ void BindImperative(py::module *m_ptr) { .def("_remove_grad_hook", [](imperative::VarBase &self, int64_t hook_id) { PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot remove gradient hook on a Tensor that stop " "gradient or without gradient.")); @@ -1421,7 +1535,8 @@ void BindImperative(py::module *m_ptr) { .def("_register_void_function_post_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot register void function post hook on a Tensor that " "stop " @@ -1437,11 +1552,13 @@ void BindImperative(py::module *m_ptr) { "_register_backward_hook", [](imperative::VarBase &self, const py::handle &hook) { PADDLE_ENFORCE_EQ( - self.IsLeaf(), true, + self.IsLeaf(), + true, platform::errors::InvalidArgument( "Only can register backward hook for leaf Tensor.")); PADDLE_ENFORCE_EQ( - !self.OverridedStopGradient() && self.HasGradVar(), true, + !self.OverridedStopGradient() && self.HasGradVar(), + true, platform::errors::InvalidArgument( "Cannot register backward hook on a Tensor that stop " "gradient or without gradient.")); @@ -1534,7 +1651,8 @@ void BindImperative(py::module *m_ptr) { .def( "cuda", [](const std::shared_ptr &self, - py::handle &handle, bool blocking) { + py::handle &handle, + bool blocking) { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot copy this Tensor to GPU in CPU 
version Paddle, " @@ -1576,7 +1694,9 @@ void BindImperative(py::module *m_ptr) { } #endif }, - py::arg("device_id") = py::none(), py::arg("blocking") = true, R"DOC( + py::arg("device_id") = py::none(), + py::arg("blocking") = true, + R"DOC( Returns a copy of this Tensor in GPU memory. If this Tensor is already in GPU memory and device_id is default, @@ -1610,7 +1730,8 @@ void BindImperative(py::module *m_ptr) { [](const std::shared_ptr &self) { #ifndef _WIN32 PADDLE_ENFORCE_EQ( - platform::is_cpu_place(self->Place()), true, + platform::is_cpu_place(self->Place()), + true, platform::errors::InvalidArgument( "Sharing memory only support CPU Tensor currently")); // 1. get LoDTensor @@ -1627,8 +1748,11 @@ void BindImperative(py::module *m_ptr) { const std::string &ipc_name = shared_writer_holder->ipc_name(); memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); // 4. copy data & reset holder - memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), - platform::CPUPlace(), data_ptr, data_size); + memory::Copy(platform::CPUPlace(), + shared_writer_holder->ptr(), + platform::CPUPlace(), + data_ptr, + data_size); t->ResetHolder(shared_writer_holder); return *t; #else @@ -1641,7 +1765,8 @@ void BindImperative(py::module *m_ptr) { .def( "_uva", [](const std::shared_ptr &self, int device_id) { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), true, + PADDLE_ENFORCE_EQ(platform::is_cpu_place(self->Place()), + true, platform::errors::InvalidArgument( "Unified virtual addressing only support " "CPU Tensor currently.")); @@ -1649,7 +1774,9 @@ void BindImperative(py::module *m_ptr) { self->MutableVar()->GetMutable(); tensor_uva(self_tensor, device_id); }, - py::arg("device_id") = 0, py::return_value_policy::reference, R"DOC( + py::arg("device_id") = 0, + py::return_value_policy::reference, + R"DOC( Returns self tensor with the UVA(unified virtual addressing). 
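The `cuda` binding reformatted above takes a `device_id` (default `None`) and a `blocking` flag (default `True`). A small usage sketch, assuming a CUDA-enabled Paddle build; the tensor contents and device id are illustrative:

```python
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], place=paddle.CPUPlace())
y = x.cuda()                              # copy to the current GPU
z = x.cuda(device_id=0, blocking=False)   # explicit device, asynchronous copy
print(y.place, z.place)
```

Passing `blocking=False` requests an asynchronous copy, in the same spirit as the GpuCopyAsync note attached to the `_copy_to` bindings later in this diff.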
Args: @@ -1669,7 +1796,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CPUPlace &place, bool blocking) { + const platform::CPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); // Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to // copy data from the tensor of self to the tensor of new varbase, @@ -1688,7 +1816,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CUDAPinnedPlace &place, bool blocking) { + const platform::CUDAPinnedPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1699,7 +1828,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::XPUPlace &place, bool blocking) { + const platform::XPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1710,7 +1840,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CUDAPlace &place, bool blocking) { + const platform::CUDAPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1721,7 +1852,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::NPUPlace &place, bool blocking) { + const platform::NPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1732,7 +1864,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::IPUPlace &place, bool blocking) { + const platform::IPUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1743,7 +1876,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::MLUPlace &place, bool blocking) { + const platform::MLUPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1754,7 +1888,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::CustomPlace &place, bool blocking) { + const platform::CustomPlace &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1765,7 +1900,8 @@ void BindImperative(py::module *m_ptr) { .def( "_copy_to", [](const std::shared_ptr &self, - const platform::Place &place, bool blocking) { + const platform::Place &place, + bool blocking) { auto new_var = self->NewVarBase(place, blocking); if (!blocking) { IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); @@ -1774,13 +1910,15 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::copy) .def( - "value", [](imperative::VarBase &self) { return self.MutableVar(); }, + "value", + [](imperative::VarBase &self) { return self.MutableVar(); }, py::return_value_policy::reference) .def("_clear", [](const std::shared_ptr &self) { auto *t = 
self->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); t->clear(); @@ -1789,7 +1927,8 @@ void BindImperative(py::module *m_ptr) { [](const std::shared_ptr &self) { auto *t = self->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); return t->offset(); @@ -1800,7 +1939,8 @@ void BindImperative(py::module *m_ptr) { auto *src = self->MutableVar()->GetMutable(); auto *dst_ = dst->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - src->IsInitialized(), true, + src->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); dst_->ShareBufferWith(*src); @@ -1822,7 +1962,8 @@ void BindImperative(py::module *m_ptr) { auto *src = self->MutableVar()->GetMutable(); auto *dst_ = dst->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - src->IsInitialized(), true, + src->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); dst_->ShareBufferWith(*src); @@ -1841,10 +1982,12 @@ void BindImperative(py::module *m_ptr) { }) .def("_slice", [](const std::shared_ptr &self, - int64_t begin_idx, int64_t end_idx) { + int64_t begin_idx, + int64_t end_idx) { auto *t = self->MutableVar()->GetMutable(); PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument( "Tensor %s has not been initialized!", self->Name())); return t->Slice(begin_idx, end_idx); @@ -1880,12 +2023,13 @@ void BindImperative(py::module *m_ptr) { x = paddle.to_tensor(1, dtype='complex128') x.element_size() # 16 )DOC") - .def_property("name", &imperative::VarBase::Name, - &imperative::VarBase::SetName) + .def_property( + "name", &imperative::VarBase::Name, &imperative::VarBase::SetName) .def_property("stop_gradient", &imperative::VarBase::OverridedStopGradient, &imperative::VarBase::SetOverridedStopGradient) - .def_property("persistable", &imperative::VarBase::Persistable, + .def_property("persistable", + &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) .def_property_readonly( "shape", @@ -1909,7 +2053,8 @@ void BindImperative(py::module *m_ptr) { return std::vector(); } }) - .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, + .def_property_readonly("is_leaf", + &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. 
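The property bindings above (`stop_gradient`, `persistable`, `is_leaf`) and the `clear_gradient` method earlier in this file are easiest to read together. A short sketch of the intended dygraph flow, assuming the `paddle.nn.Linear` layer used in the `clear_gradient` docstring:

```python
import paddle

linear = paddle.nn.Linear(4, 4)
x = paddle.rand([2, 4])

print(linear.weight.is_leaf)    # True: parameters are created by the user
y = linear(x)
print(y.is_leaf)                # False: y is the output of an op that needs grad

y.sum().backward()
print(linear.weight.grad)       # gradient accumulated by backward()
linear.weight.clear_gradient()  # set_to_zero defaults to True, so grad is zeroed
```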
@@ -1939,7 +2084,8 @@ void BindImperative(py::module *m_ptr) { print(y.is_leaf) # False )DOC") .def_property_readonly( - "place", [](imperative::VarBase &self) { return self.Place(); }, + "place", + [](imperative::VarBase &self) { return self.Place(); }, py::return_value_policy::copy) .def_property_readonly("_place_str", [](imperative::VarBase &self) { @@ -1969,11 +2115,14 @@ void BindImperative(py::module *m_ptr) { .def_property("_enable_program_desc_tracing", &imperative::Tracer::IsProgramDescTracingEnabled, &imperative::Tracer::SetEnableProgramDescTracing) - .def_property("_amp_level", &imperative::Tracer::GetAmpLevel, + .def_property("_amp_level", + &imperative::Tracer::GetAmpLevel, &imperative::Tracer::SetAmpLevel) - .def_property("_amp_dtype", &imperative::Tracer::GetAmpDtype, + .def_property("_amp_dtype", + &imperative::Tracer::GetAmpDtype, &imperative::Tracer::SetAmpDtype) - .def_property("_has_grad", &imperative::Tracer::HasGrad, + .def_property("_has_grad", + &imperative::Tracer::HasGrad, &imperative::Tracer::SetHasGrad) .def_property( "_expected_place", @@ -2039,7 +2188,8 @@ void BindImperative(py::module *m_ptr) { .def("_get_program_desc_tracer", &imperative::Tracer::GetProgramDescTracer, py::return_value_policy::reference) - .def("_generate_unique_name", &imperative::Tracer::GenerateUniqueName, + .def("_generate_unique_name", + &imperative::Tracer::GenerateUniqueName, py::arg("key") = "dygraph_tmp") .def("_set_amp_op_list", [](imperative::Tracer &self, @@ -2067,8 +2217,10 @@ void BindImperative(py::module *m_ptr) { *(imperative::AmpOperators::Instance().GetMutableBlockOps())); }) .def("_get_kernel_signature", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, framework::AttributeMap attrs) { // TODO(xiongkun): move this function outside of tracer. 
auto ins_map = ConvertToNameTensorMap(ins); @@ -2086,118 +2238,167 @@ void BindImperative(py::module *m_ptr) { [](paddle::small_vector &vec) { return std::vector(vec.begin(), vec.end()); }; - auto ret = self.GetExpectedKernelSignature(type, ins_map, - outs_map, attrs); + auto ret = self.GetExpectedKernelSignature( + type, ins_map, outs_map, attrs); auto kernelsig_ins = input_to_vector(ret.input_names); auto kernelsig_attrs = attr_to_vector(ret.attr_names); auto kernelsig_outs = output_to_vector(ret.output_names); - return std::make_tuple(kernelsig_ins, kernelsig_attrs, - kernelsig_outs); + return std::make_tuple( + kernelsig_ins, kernelsig_attrs, kernelsig_outs); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::CustomPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::CustomPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::XPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::XPUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::CUDAPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::CUDAPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::NPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::NPUPlace &place, bool 
trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::IPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::IPUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::MLUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::MLUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }) .def("trace", - [](imperative::Tracer &self, const std::string &type, - const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, - framework::AttributeMap attrs, const platform::CPUPlace &place, + [](imperative::Tracer &self, + const std::string &type, + const PyNameVarBaseMap &ins, + const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, + const platform::CPUPlace &place, bool trace_backward, const std::map &inplace_map = {}) { auto ins_map = ConvertToNameVarBaseMap(ins); auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.TraceOp( - type, std::move(ins_map), std::move(outs_map), - std::move(attrs), place, trace_backward, inplace_map); + self.TraceOp(type, + std::move(ins_map), + std::move(outs_map), + std::move(attrs), + place, + trace_backward, + inplace_map); } }); @@ -2258,11 +2459,20 @@ void BindImperative(py::module *m_ptr) { &output_targets, const std::vector> &output_grads, const std::vector> &no_grad_vars, - const platform::Place &place, bool create_graph, bool retain_graph, - bool allow_unused, bool only_inputs) { - imperative::PartialGradEngine engine( - input_targets, output_targets, output_grads, no_grad_vars, place, - create_graph, retain_graph, allow_unused, only_inputs); + const platform::Place &place, + bool create_graph, + bool retain_graph, + bool allow_unused, + bool only_inputs) { + imperative::PartialGradEngine engine(input_targets, + output_targets, + output_grads, + 
no_grad_vars, + place, + create_graph, + retain_graph, + allow_unused, + only_inputs); engine.Execute(); return engine.GetResult(); }, @@ -2272,7 +2482,8 @@ void BindImperative(py::module *m_ptr) { "dygraph_run_backward", [](const std::vector> &tensors, const std::vector> &grad_tensors, - bool retain_graph, const imperative::Tracer &tracer) { + bool retain_graph, + const imperative::Tracer &tracer) { auto *engine = tracer.GetEngine(); engine->Init(tensors, grad_tensors, retain_graph); VLOG(3) << "Start backward"; @@ -2294,11 +2505,16 @@ void BindImperative(py::module *m_ptr) { const std::vector> &, const std::vector &, std::shared_ptr, - const std::vector &, bool>()) - .def("prepare_for_backward", &imperative::Reducer::PrepareForBackward, - py::arg("vars"), py::call_guard()); - - m.def("assign_group_by_size", &imperative::AssignGroupBySize, py::arg("vars"), + const std::vector &, + bool>()) + .def("prepare_for_backward", + &imperative::Reducer::PrepareForBackward, + py::arg("vars"), + py::call_guard()); + + m.def("assign_group_by_size", + &imperative::AssignGroupBySize, + py::arg("vars"), py::arg("is_sparse_gradient"), py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, py::arg("tensor_indices") = std::vector{}, @@ -2306,7 +2522,8 @@ void BindImperative(py::module *m_ptr) { #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - py::class_>( m, "NCCLParallelContext") .def(py::init>( m, "BKCLParallelContext") .def(py::init>( m, "GLOOParallelContext") .def(py::init>( m, "HCCLParallelContext") .def(py::init>( m, "CNCLParallelContext") .def(py::init>( m, "HeterParallelContext") .def(py::init()) @@ -2376,42 +2598,56 @@ void BindImperative(py::module *m_ptr) { #endif m.def("pylayer_apply", - [](const platform::CPUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CPUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::CUDAPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CUDAPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::XPUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::XPUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::CUDAPinnedPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CUDAPinnedPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::NPUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::NPUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::MLUPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::MLUPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return 
imperative::PyLayerApply(place, cls, args, kwargs); }); m.def("pylayer_apply", - [](const platform::CustomPlace &place, const py::object &cls, - const py::args args, const py::kwargs kwargs) { + [](const platform::CustomPlace &place, + const py::object &cls, + const py::args args, + const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); @@ -2437,8 +2673,8 @@ void BindImperative(py::module *m_ptr) { SetUVATensorFromPyArray(new_tensor, array, device_id); } else if (py::isinstance>( array)) { - SetUVATensorFromPyArray(new_tensor, array, - device_id); + SetUVATensorFromPyArray( + new_tensor, array, device_id); } else if (py::isinstance>(array)) { SetUVATensorFromPyArray(new_tensor, array, device_id); } else { @@ -2452,8 +2688,10 @@ void BindImperative(py::module *m_ptr) { } return new_tensor; }, - py::arg("obj"), py::arg("device_id") = 0, - py::return_value_policy::reference, R"DOC( + py::arg("obj"), + py::arg("device_id") = 0, + py::return_value_policy::reference, + R"DOC( Returns tensor with the UVA(unified virtual addressing) created from numpy array. Args: @@ -2485,26 +2723,32 @@ void BindImperative(py::module *m_ptr) { #if defined(PADDLE_WITH_CUDA) m.def( "async_write", - [](const imperative::VarBase &src, imperative::VarBase &dst, - const imperative::VarBase &offset, const imperative::VarBase &count) { + [](const imperative::VarBase &src, + imperative::VarBase &dst, + const imperative::VarBase &offset, + const imperative::VarBase &count) { PADDLE_ENFORCE_EQ( - platform::is_gpu_place(src.Place()), true, + platform::is_gpu_place(src.Place()), + true, platform::errors::InvalidArgument( "Required `src` device should be CUDAPlace, but received %d. ", src.Place())); PADDLE_ENFORCE_EQ( - platform::is_cuda_pinned_place(dst.Place()), true, + platform::is_cuda_pinned_place(dst.Place()), + true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPinnedPlace, " "but received %d. ", dst.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(offset.Place()), true, + platform::is_cpu_place(offset.Place()), + true, platform::errors::InvalidArgument("Required `offset` device should " "be CPUPlace, but received %d. ", offset.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(count.Place()), true, + platform::is_cpu_place(count.Place()), + true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d. 
", count.Place())); @@ -2517,23 +2761,28 @@ void BindImperative(py::module *m_ptr) { auto &count_tensor = count.Var().Get(); const auto &deviceId = paddle::platform::GetCurrentDeviceId(); - PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`offset` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`count` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + PADDLE_ENFORCE_EQ(offset_tensor.numel(), + count_tensor.numel(), platform::errors::InvalidArgument( "`offset` and `count` tensor size dismatch.")); PADDLE_ENFORCE_EQ( - src_tensor.dims().size(), dst_tensor->dims().size(), + src_tensor.dims().size(), + dst_tensor->dims().size(), platform::errors::InvalidArgument( "`src` and `dst` should have the same tensor shape, " "except for the first dimension.")); for (int i = 1; i < src_tensor.dims().size(); i++) { PADDLE_ENFORCE_EQ( - src_tensor.dims()[i], dst_tensor->dims()[i], + src_tensor.dims()[i], + dst_tensor->dims()[i], platform::errors::InvalidArgument( "`src` and `dst` should have the same tensor shape, " "except for the first dimension.")); @@ -2550,15 +2799,19 @@ void BindImperative(py::module *m_ptr) { int64_t src_offset = 0, dst_offset, c; for (int64_t i = 0; i < offset_tensor.numel(); i++) { dst_offset = offset_data[i], c = count_data[i]; - PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + PADDLE_ENFORCE_LE(src_offset + c, + src_tensor.dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index")); - PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + PADDLE_ENFORCE_LE(dst_offset + c, + dst_tensor->dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index")); - cudaMemcpyAsync( - dst_data + (dst_offset * size), src_data + (src_offset * size), - c * size * sizeof(float), cudaMemcpyDeviceToHost, stream); + cudaMemcpyAsync(dst_data + (dst_offset * size), + src_data + (src_offset * size), + c * size * sizeof(float), + cudaMemcpyDeviceToHost, + stream); src_offset += c; } }, @@ -2615,37 +2868,46 @@ void BindImperative(py::module *m_ptr) { m.def( "async_read", - [](const imperative::VarBase &src, imperative::VarBase &dst, - const imperative::VarBase &index, imperative::VarBase &buffer, - const imperative::VarBase &offset, const imperative::VarBase &count) { - PADDLE_ENFORCE_EQ(platform::is_cuda_pinned_place(src.Place()), true, + [](const imperative::VarBase &src, + imperative::VarBase &dst, + const imperative::VarBase &index, + imperative::VarBase &buffer, + const imperative::VarBase &offset, + const imperative::VarBase &count) { + PADDLE_ENFORCE_EQ(platform::is_cuda_pinned_place(src.Place()), + true, platform::errors::InvalidArgument( "Required `src` device should be " "CUDAPinnedPlace, but received %d.", src.Place())); PADDLE_ENFORCE_EQ( - platform::is_gpu_place(dst.Place()), true, + platform::is_gpu_place(dst.Place()), + true, platform::errors::InvalidArgument( "Required `dst` device should be CUDAPlace, but received %d.", dst.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(index.Place()), true, + platform::is_cpu_place(index.Place()), + true, platform::errors::InvalidArgument( "Required `index` device should be CPUPlace, but received %d.", index.Place())); PADDLE_ENFORCE_EQ( - platform::is_cuda_pinned_place(buffer.Place()), true, + 
platform::is_cuda_pinned_place(buffer.Place()), + true, platform::errors::InvalidArgument( "Required `buffer` device should be CUDAPinnedPlace, " "but received %d.", buffer.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(offset.Place()), true, + platform::is_cpu_place(offset.Place()), + true, platform::errors::InvalidArgument( "Required `offset` device should be CPUPlace, but received %d.", offset.Place())); PADDLE_ENFORCE_EQ( - platform::is_cpu_place(count.Place()), true, + platform::is_cpu_place(count.Place()), + true, platform::errors::InvalidArgument( "Required `count` device should be CPUPlace, but received %d.", count.Place())); @@ -2660,28 +2922,33 @@ void BindImperative(py::module *m_ptr) { auto *dst_data = dst_tensor->mutable_data(dst.Place()); const auto &deviceId = paddle::platform::GetCurrentDeviceId(); - PADDLE_ENFORCE_EQ(src_tensor.dims().size(), dst_tensor->dims().size(), + PADDLE_ENFORCE_EQ(src_tensor.dims().size(), + dst_tensor->dims().size(), platform::errors::InvalidArgument( "`src` and `dst` should have same tensor shape, " "except for the first dimension.")); PADDLE_ENFORCE_EQ( - src_tensor.dims().size(), buffer_tensor->dims().size(), + src_tensor.dims().size(), + buffer_tensor->dims().size(), platform::errors::InvalidArgument( "`src` and `buffer` should have same tensor shape, " "except for the first dimension.")); for (int i = 1; i < src_tensor.dims().size(); i++) { PADDLE_ENFORCE_EQ( - src_tensor.dims()[i], dst_tensor->dims()[i], + src_tensor.dims()[i], + dst_tensor->dims()[i], platform::errors::InvalidArgument( "`src` and `dst` should have the same tensor shape, " "except for the first dimension.")); PADDLE_ENFORCE_EQ( - src_tensor.dims()[i], buffer_tensor->dims()[i], + src_tensor.dims()[i], + buffer_tensor->dims()[i], platform::errors::InvalidArgument( "`src` and `buffer` should have the same tensor shape, " "except for the first dimension.")); } - PADDLE_ENFORCE_EQ(index_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(index_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`index` tensor should be one-dimensional.")); @@ -2693,13 +2960,16 @@ void BindImperative(py::module *m_ptr) { int64_t size = src_tensor.numel() / src_tensor.dims()[0]; if (copy_flag != 0) { - PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`offset` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(count_tensor.dims().size(), 1, + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), + 1, platform::errors::InvalidArgument( "`count` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(offset_tensor.numel(), count_tensor.numel(), + PADDLE_ENFORCE_EQ(offset_tensor.numel(), + count_tensor.numel(), platform::errors::InvalidArgument( "`offset` and `count` tensor size dismatch.")); auto *offset_data = offset_tensor.data(); @@ -2711,7 +2981,8 @@ void BindImperative(py::module *m_ptr) { buffer_tensor->dims()[0], platform::errors::InvalidArgument( "Buffer tensor size is too small.")); - PADDLE_ENFORCE_LE(numel + index_tensor.numel(), dst_tensor->dims()[0], + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), + dst_tensor->dims()[0], platform::errors::InvalidArgument( "Target tensor size is too small.")); @@ -2719,19 +2990,24 @@ void BindImperative(py::module *m_ptr) { auto *src_data = src_tensor.data(); for (int64_t i = 0; i < offset_tensor.numel(); i++) { src_offset = offset_data[i], c = count_data[i]; - PADDLE_ENFORCE_LE(src_offset + c, src_tensor.dims()[0], + PADDLE_ENFORCE_LE(src_offset + c, 
+ src_tensor.dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index.")); - PADDLE_ENFORCE_LE(dst_offset + c, dst_tensor->dims()[0], + PADDLE_ENFORCE_LE(dst_offset + c, + dst_tensor->dims()[0], platform::errors::InvalidArgument( "Invalid offset or count index.")); - cudaMemcpyAsync( - dst_data + (dst_offset * size), src_data + (src_offset * size), - c * size * sizeof(float), cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(dst_data + (dst_offset * size), + src_data + (src_offset * size), + c * size * sizeof(float), + cudaMemcpyHostToDevice, + stream); dst_offset += c; } } else { - PADDLE_ENFORCE_LE(index_tensor.numel(), buffer_tensor->dims()[0], + PADDLE_ENFORCE_LE(index_tensor.numel(), + buffer_tensor->dims()[0], platform::errors::InvalidArgument( "Buffer tensor size is too small.")); } @@ -2749,16 +3025,19 @@ void BindImperative(py::module *m_ptr) { int64_t c = 0; for (int64_t i = 0; i < index_tensor.numel(); i++) { std::memcpy(buffer_data + c * slice_size, - src_data + index_data[i] * slice_size, copy_bytes); + src_data + index_data[i] * slice_size, + copy_bytes); c += 1; } }; index_select(src_tensor, index_tensor, buffer_tensor); // Copy the data to device memory - cudaMemcpyAsync(dst_data + (numel * size), buffer_tensor->data(), + cudaMemcpyAsync(dst_data + (numel * size), + buffer_tensor->data(), index_tensor.numel() * size * sizeof(float), - cudaMemcpyHostToDevice, stream); + cudaMemcpyHostToDevice, + stream); }, R"DOC( This api provides a way to read from pieces of source tensor to destination tensor