diff --git a/paddle/fluid/framework/ipu/compiler.cc b/paddle/fluid/framework/ipu/compiler.cc
index 0480adfb6d433..8de89293c1fcc 100644
--- a/paddle/fluid/framework/ipu/compiler.cc
+++ b/paddle/fluid/framework/ipu/compiler.cc
@@ -11,106 +11,194 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include "paddle/fluid/framework/ipu/compiler.h"
+#include "paddle/fluid/framework/ipu/ipu_utils.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
 namespace paddle {
 namespace framework {
 namespace ipu {
 
-Compiler::Compiler(const IpuStrategy* ipu_strategy) {
-  ipu_strategy_ = ipu_strategy;
+template <typename T>
+T GetAttrAllowNull(std::string attr, OpDesc* op_desc) {
+  std::string type = typeid(T).name();
+  VLOG(10) << "body attr type is: " << type << " body attr name is: " << attr;
+  if (op_desc->HasAttr(attr)) {
+    return BOOST_GET_CONST(T, op_desc->GetAttr(attr));
+  } else {
+    VLOG(10) << "body attr not exist: " << type;
+    return {};
+  }
+}
+
+Compiler::Compiler() {
   builder_ = popart::Builder::create();
   RegisterOpFunc();
 }
 
-Compiler::~Compiler() {
-  builder_.release();
-  tensors_.clear();
-  ipu_strategy_ = nullptr;
-}
+Compiler::~Compiler() = default;
 
-void Compiler::InsertTensors(std::vector<std::string> output_names,
-                             std::vector<std::string> tensor_ids) {
-  for (int i = 0; i < tensor_ids.size(); i++) {
-    std::string tensor_id = tensor_ids[i];
-    tensors_.emplace(output_names[i], tensor_ids[i]);
-  }
-}
+void Compiler::RegisterOpFunc() {
+  VLOG(10) << "enter Compiler::RegisterOpFunc";
+#define INT_VEC std::vector<std::int64_t>
+#define FLOAT_VEC std::vector<float>
+#define FLOAT float
+#define INT std::int64_t
+#define BOOL bool
+#define STRING std::string
+#define STRING_VEC std::vector<std::string>
+#define NONE
 
-void Compiler::InsertTensors(std::vector<std::string> output_names,
-                             std::string tensor_id) {
-  tensors_.insert(
-      std::pair<std::string, std::string>(output_names[0], tensor_id));
-}
+#define ARG(Type, Name) , GetAttrAllowNull<Type>(#Name, op_desc)
+#define POPART_CONST_ARG(Name) , const PopartConstant& Name
+#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name
+#define POPART_ATTRIB_VEC_ARG(Name)
+#define BODY_ARG(Name) NONE
 
-void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
-                                const OpDesc* op_desc) {
-  // TODO(xiaobingw): replace ipu_index with macro or constexpr
-  VLOG(10) << "enter Compiler::SetIpuIndexStage";
-  auto tensor_ids_set =
-      std::set<std::string>(tensor_ids.begin(), tensor_ids.end());
-  if (op_desc->HasAttr("ipu_index")) {
-    auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_index"));
-    builder_->virtualGraph(tensor_ids_set, ipu_index);
-    VLOG(10) << "set ipu_index= " << ipu_index
-             << " for op: " << op_desc->Type();
-    if (op_desc->HasAttr("ipu_stage")) {
-      auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_stage"));
-      builder_->pipelineStage(tensor_ids_set, ipu_stage);
-      VLOG(10) << "set ipu_stage= " << ipu_stage
-               << " for op: " << op_desc->Type();
-    }
-  }
-  VLOG(10) << "leave Compiler::SetIpuIndexStage";
-}
+  name_function_ = {
+#define OP_DECL(FuncName, OnnxImpl, Args)                      \
+  {#FuncName, [&](OpDesc* op_desc) {                           \
+     auto op_type = op_desc->Type();                           \
+     VLOG(10) << "build op:" << op_type << " args " << #Args;  \
+     auto inputs = GetOpInputs(op_desc);                       \
+     auto output_names = GetOpOutputs(op_desc);                \
+     auto aiOnnxOpset1 = builder_->aiGraphcoreOpset1();        \
+     auto aiOnnxOpset = builder_->aiOnnxOpset11();             \
+     auto output_ids = OnnxImpl(inputs Args);                  \
+     SetIpuIndexStage(output_ids, op_desc);                    \
+     InsertTensors(output_names, output_ids);                  \
+   }},  // NOLINT
+#include "paddle/fluid/framework/ipu/supported_ops_autogen.h"
+  };
 
-void Compiler::SetIpuIndexStage(const std::string& tensor_id,
-                                const OpDesc* op_desc) {
-  VLOG(10) << "enter Compiler::SetIpuIndexStage";
-  if (op_desc->HasAttr("ipu_index")) {
-    auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_index"));
-    builder_->virtualGraph(tensor_id, ipu_index);
-    VLOG(10) << "set ipu_index= " << ipu_index
-             << " for op: " << op_desc->Type();
-    if (op_desc->HasAttr("ipu_stage")) {
-      auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_stage"));
-      builder_->pipelineStage(tensor_id, ipu_stage);
-      VLOG(10) << "set ipu_stage= " << ipu_stage
-               << " for op: " << op_desc->Type();
-    }
-  }
-  VLOG(10) << "leave Compiler::SetIpuIndexStage";
+#undef OP_DECL
+// #undef OP_DECL_NO_RETURN
+#undef BODY_ARG
+#undef POPART_ATTRIB_VEC_ARG
+#undef HOST_SIDE_CONST_ARG
+#undef POPART_CONST_ARG
+#undef ARG
+#undef NONE
+#undef STRING_VEC
+#undef STRING
+#undef BOOL
+#undef INT
+#undef FLOAT
+#undef FLOAT_VEC
+#undef INT_VEC
+
+  // // self register ops
+  // #include "paddle/fluid/framework/ipu/supported_ops_custom.h"
+  // name_function_.emplace("popart_reducemean", ReduceMeanHandler);
+  // name_function_.emplace("popart_batchnormalization", BatchNormHandler);
+  // name_function_.emplace("popart_constant", Constant);
+  // name_function_.emplace("popart_nllloss", NllLoss);
+  // name_function_.emplace("popart_groupnormalization", Groupnormalization);
+
+  // // used for debug
+  // for (auto elem : name_function_) {
+  //   VLOG(10) << "registered in map : " << elem.first << " second "
+  //            << &(elem.second);
+  // }
 }
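For readers unfamiliar with this X-macro pattern: each OP_DECL row in supported_ops_autogen.h expands into one entry of `name_function_`. A sketch of what the `popart_gelu` row turns into (illustrative, not the literal preprocessor output):

    // Sketch: expansion of OP_DECL(popart_gelu, aiOnnxOpset1.gelu, NONE).
    // NONE expands to nothing, so the op is built from its inputs alone.
    {"popart_gelu", [&](OpDesc* op_desc) {
       auto op_type = op_desc->Type();
       VLOG(10) << "build op:" << op_type << " args " << "NONE";
       auto inputs = GetOpInputs(op_desc);
       auto output_names = GetOpOutputs(op_desc);
       auto aiOnnxOpset1 = builder_->aiGraphcoreOpset1();
       auto aiOnnxOpset = builder_->aiOnnxOpset11();
       auto output_ids = aiOnnxOpset1.gelu(inputs);  // OnnxImpl(inputs Args)
       SetIpuIndexStage(output_ids, op_desc);
       InsertTensors(output_names, output_ids);
     }},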
 
-template <typename T>
-T GetAttrAllowNull(std::string attr, OpDesc* op_desc) {
-  std::string type = typeid(T).name();
-  VLOG(1) << "body attr type is: " << type << " body attr name is: " << attr;
-  if (op_desc->HasAttr(attr)) {
-    return BOOST_GET_CONST(T, op_desc->GetAttr(attr));
-  } else {
-    VLOG(1) << "body attr not exist: " << type;
-    return {};
+void Compiler::LowerBody(const ir::Graph* graph) {
+  VLOG(10) << "enter Compiler::LowerBody";
+  auto nodes = ir::TopologySortOperations(*graph);
+  for (auto* node : nodes) {
+    auto* op_desc = node->Op();
+    auto op_type = op_desc->Type();
+    VLOG(10) << "node->type: " << op_type;
+
+    auto itr = name_function_.find(op_type);
+    if (itr != name_function_.end()) {
+      itr->second(node->Op());
+    } else if (op_type == "popart_constant") {
+      auto dims =
+          BOOST_GET_CONST(std::vector<int64_t>, op_desc->GetAttr("dims"));
+      auto dtype_ = BOOST_GET_CONST(int, op_desc->GetAttr("dtype"));
+      auto dtype = OnnxDtype2PopartType(dtype_);
+      popart::TensorInfo tensor_info{dtype, dims};
+      auto value_attr = op_desc->GetAttr("value");
+      auto const_data = std::unique_ptr<popart::ConstVoidData>{};
+      switch (dtype) {
+        case popart::DataType::FLOAT:
+          const_data.reset(new popart::ConstVoidData(
+              BOOST_GET_CONST(std::vector<float>, value_attr).data(),
+              tensor_info));
+          break;
+        case popart::DataType::INT32:
+          const_data.reset(new popart::ConstVoidData(
+              BOOST_GET_CONST(std::vector<int>, value_attr).data(),
+              tensor_info));
+          break;
+        case popart::DataType::DOUBLE:
+          const_data.reset(new popart::ConstVoidData(
+              BOOST_GET_CONST(std::vector<double>, value_attr).data(),
+              tensor_info));
+          break;
+        case popart::DataType::INT64:
+          const_data.reset(new popart::ConstVoidData(
+              BOOST_GET_CONST(std::vector<int64_t>, value_attr).data(),
+              tensor_info));
+          break;
+        default:
+          PADDLE_THROW(
+              platform::errors::Unimplemented("popart::DataType %d", dtype));
+      }
+      popart::TensorId result =
+          builder_->aiOnnxOpset11().constant(*const_data);
+      SetIpuIndexStage(result, op_desc);
+      InsertTensors(GetOpOutputs(op_desc), result);
+    } else if (op_type == "popart_reducemean") {
+      auto inputs = GetOpInputs(op_desc);
+      auto axes = nonstd::optional<std::vector<int64_t>>();
+      if (op_desc->HasAttr("axes")) {
+        axes = BOOST_GET_CONST(std::vector<int64_t>, op_desc->GetAttr("axes"));
+      }
+      auto keepdims = BOOST_GET_CONST(int64_t, op_desc->GetAttr("keepdims"));
+      popart::TensorId result =
+          builder_->aiOnnxOpset11().reducemean(inputs, axes, keepdims);
+      SetIpuIndexStage(result, op_desc);
+      InsertTensors(GetOpOutputs(op_desc), result);
+    } else if (op_type == "popart_batchnormalization") {
+      auto inputs = GetOpInputs(op_desc);
+      auto outputs = GetOpOutputs(op_desc);
+      auto num_outputs = outputs.size();
+      auto epsilon = BOOST_GET_CONST(float, op_desc->GetAttr("epsilon"));
+      auto momentum = BOOST_GET_CONST(float, op_desc->GetAttr("momentum"));
+      auto result = builder_->aiOnnxOpset11().batchnormalization(
+          inputs, num_outputs, epsilon, momentum);
+      SetIpuIndexStage(result, op_desc);
+      InsertTensors(GetOpOutputs(op_desc), result);
+    } else if (op_type == "popart_nllloss") {
+      auto inputs = GetOpInputs(op_desc);
+      auto ignoreIndex = BOOST_GET_CONST(int, op_desc->GetAttr("ignoreIndex"));
+      auto result = builder_->aiGraphcoreOpset1().nllloss(
+          inputs, popart::ReductionType::NoReduction, ignoreIndex);
+      SetIpuIndexStage(result, op_desc);
+      InsertTensors(GetOpOutputs(op_desc), result);
+    } else {
+      PADDLE_THROW(
+          platform::errors::NotFound("%s is not registered", op_type));
+    }
   }
+  VLOG(10) << "leave Compiler::LowerBody";
 }
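The `popart_constant` branch above hand-builds a popart::ConstVoidData because the value lives in an OpDesc attribute rather than in a graph tensor. A minimal standalone sketch of the same pattern (the shape and values here are made up):

    // Sketch: adding a 1-D float constant through the popart Builder.
    std::vector<int64_t> dims{3};
    std::vector<float> values{1.0f, 2.0f, 3.0f};
    popart::TensorInfo tensor_info{popart::DataType::FLOAT, dims};
    popart::ConstVoidData const_data{values.data(), tensor_info};
    popart::TensorId out = builder_->aiOnnxOpset11().constant(const_data);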
 
 void Compiler::InitInputs(ir::Graph* graph,
                           const std::vector<std::string>& feed_list) {
   for (const auto& feed_name : feed_list) {
-    VLOG(1) << feed_name;
-
     for (const ir::Node* n : graph->Nodes()) {
       if (n->IsVar()) {
         auto* var_desc = n->Var();
         if (feed_name == var_desc->Name()) {
-          // Get tensor_info from var_desc
-          VLOG(1) << "feed_name= " << var_desc->Name();
+          VLOG(10) << "feed_name= " << var_desc->Name();
           auto data_type = VarType2PopartType(var_desc->GetDataType());
           popart::TensorInfo input_info{data_type, var_desc->GetShape()};
-          // Create popart tensor
-          VLOG(1) << "popart input_info = " << input_info;
+          VLOG(10) << "popart input_info = " << input_info;
           popart::TensorId tensor_id = builder_->addInputTensor(input_info);
-          VLOG(1) << "popart input tensor id = " << tensor_id;
+          VLOG(10) << "popart input tensor id = " << tensor_id;
           inputs_.push_back(tensor_id);
           tensors_.emplace(var_desc->Name(), tensor_id);
         }
@@ -125,8 +213,8 @@ void Compiler::InitOutputs(const std::vector<std::string>& fetch_list) {
     PADDLE_ENFORCE_NE(tensor, tensors_.end(),
                       platform::errors::NotFound(
                           "output tensor %s does not exist.", fetch_name));
-    VLOG(1) << "fetch_name= " << fetch_name;
-    VLOG(1) << "popart output tensor id = " << tensor->second;
+    VLOG(10) << "fetch_name= " << fetch_name;
+    VLOG(10) << "popart output tensor id = " << tensor->second;
     builder_->addOutputTensor(tensor->second);
     outputs_.push_back(tensor->second);
   }
@@ -136,8 +224,7 @@ void Compiler::LowerWeights(const ir::Graph* graph, const Scope* scope_) {
   PADDLE_ENFORCE_NOT_NULL(scope_,
                           platform::errors::PreconditionNotMet(
                               "You should call set_scope before LowerWeights"));
-  // at this step, i think the graph doesn't contains optimizer
-  // related states
+  // at this step, the graph doesn't contain optimizer related states
   for (const auto* node : graph->Nodes()) {
     if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
       if (node->Var()->Persistable()) {
@@ -161,98 +248,94 @@ void Compiler::LowerWeights(const ir::Graph* graph, const Scope* scope_) {
     }
   }
 }
 
-std::vector<std::string> Compiler::GetOpInputs(const OpDesc* op) {
-  auto inputs_ = op->Input("__inputs__");
-  std::vector<std::string> inputs;
-  for (const auto& in : inputs_) {
-    if (tensors_.find(in) != tensors_.end()) {
-      inputs.push_back(tensors_[in]);
-    } else {
-      inputs.push_back(in);
+void Compiler::InsertTensors(const std::vector<std::string>& output_names,
+                             const std::vector<std::string>& tensor_ids) {
+  PADDLE_ENFORCE_EQ(output_names.size(), tensor_ids.size(),
+                    platform::errors::Fatal("InsertTensors size mismatch"));
+  for (size_t i = 0; i < tensor_ids.size(); i++) {
+    tensors_.emplace(output_names[i], tensor_ids[i]);
+  }
+}
+
+void Compiler::InsertTensors(const std::vector<std::string>& output_names,
+                             const std::string& tensor_id) {
+  PADDLE_ENFORCE_EQ(output_names.size(), 1,
+                    platform::errors::Fatal("InsertTensors size mismatch"));
+  tensors_.emplace(output_names[0], tensor_id);
+}
+
+void Compiler::SetIpuIndexStage(const std::vector<std::string>& tensor_ids,
+                                const OpDesc* op_desc) {
+  // TODO(xiaobingw): replace ipu_index with macro or constexpr
+  VLOG(10) << "enter Compiler::SetIpuIndexStage";
+  auto tensor_ids_set =
+      std::set<std::string>(tensor_ids.begin(), tensor_ids.end());
+  if (op_desc->HasAttr("ipu_index")) {
+    auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_index"));
+    builder_->virtualGraph(tensor_ids_set, ipu_index);
+    VLOG(10) << "set ipu_index= " << ipu_index
+             << " for op: " << op_desc->Type();
+    if (op_desc->HasAttr("ipu_stage")) {
+      auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_stage"));
+      builder_->pipelineStage(tensor_ids_set, ipu_stage);
+      VLOG(10) << "set ipu_stage= " << ipu_stage
+               << " for op: " << op_desc->Type();
     }
   }
-  return inputs;
+  VLOG(10) << "leave Compiler::SetIpuIndexStage";
 }
 
-void Compiler::RegisterOpFunc() {
-  VLOG(1) << "enter Compiler::RegisterOpFunc";
-#define INT_VEC std::vector<std::int64_t>
-#define FLOAT_VEC std::vector<float>
-#define FLOAT float
-#define INT std::int64_t
-#define BOOL bool
-#define STRING std::string
-#define STRING_VEC std::vector<std::string>
-#define NONE
+void Compiler::SetIpuIndexStage(const std::string& tensor_id,
+                                const OpDesc* op_desc) {
+  VLOG(10) << "enter Compiler::SetIpuIndexStage";
+  if (op_desc->HasAttr("ipu_index")) {
+    auto ipu_index = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_index"));
+    builder_->virtualGraph(tensor_id, ipu_index);
+    VLOG(10) << "set ipu_index= " << ipu_index
+             << " for op: " << op_desc->Type();
+    if (op_desc->HasAttr("ipu_stage")) {
+      auto ipu_stage = BOOST_GET_CONST(int, op_desc->GetAttr("ipu_stage"));
+      builder_->pipelineStage(tensor_id, ipu_stage);
+      VLOG(10) << "set ipu_stage= " << ipu_stage
+               << " for op: " << op_desc->Type();
+    }
+  }
+  VLOG(10) << "leave Compiler::SetIpuIndexStage";
+}
 
-#define ARG(Type, Name) , GetAttrAllowNull<Type>(#Name, op_desc)
-#define POPART_CONST_ARG(Name) , const PopartConstant& Name
-#define HOST_SIDE_CONST_ARG(Name) , const HostSideConstant& Name
-#define POPART_ATTRIB_VEC_ARG(Name)
-#define BODY_ARG(Name) NONE
+std::vector<int64_t> Compiler::GetTensorShape(const std::string& name) {
+  return builder_->getTensorShape(tensors_[name]);
+}
 
-  name_function_ = {
-#define OP_DECL(FuncName, OnnxImpl, Args)                    \
-  {#FuncName, [&](OpDesc* op_desc) {                         \
-     auto op_type = op_desc->Type();                         \
-     VLOG(1) << "build op:" << op_type << " args " << #Args; \
-     auto inputs = GetOpInputs(op_desc);                     \
-     auto output_names = op_desc->Output("__outputs__");     \
-     auto aiOnnxOpset1 = builder_->aiGraphcoreOpset1();      \
-     auto aiOnnxOpset = builder_->aiOnnxOpset11();           \
-     auto output_ids = OnnxImpl(inputs Args);               \
-     SetIpuIndexStage(output_ids, op_desc);                  \
-     InsertTensors(output_names, output_ids);                \
-   }},
-#include "paddle/fluid/framework/ipu/supported_ops_autogen.h"
-  };
+std::string Compiler::GetModelProto() { return builder_->getModelProto(); }
 
-#undef OP_DECL
-// #undef OP_DECL_NO_RETURN
-#undef BODY_ARG
-#undef POPART_ATTRIB_VEC_ARG
-#undef HOST_SIDE_CONST_ARG
-#undef POPART_CONST_ARG
-#undef ARG
-#undef NONE
-#undef STRING_VEC
-#undef STRING
-#undef BOOL
-#undef INT
-#undef FLOAT
-#undef FLOAT_VEC
-#undef INT_VEC
+void Compiler::SaveModelProto(const std::string& path) {
+  builder_->saveModelProto(path);
+}
 
-// self register ops
-#include "paddle/fluid/framework/ipu/supported_ops_custom.h"
-  name_function_.emplace("popart_reducemean", ReduceMeanHandler);
-  name_function_.emplace("popart_batchnormalization", BatchNormHandler);
-  name_function_.emplace("popart_constant", Constant);
-  name_function_.emplace("popart_nllloss", NllLoss);
-  name_function_.emplace("popart_groupnormalization", Groupnormalization);
+void Compiler::SaveModelProtoNoCheck(const std::string& path) {
+  auto proto = builder_->getModelProto();
+  std::ofstream onnxfile(path, std::ios_base::binary);
+  onnxfile.write(proto.data(), proto.size());
+  onnxfile.close();
 }
 
-void Compiler::LowerBody(const ir::Graph* graph) {
-  VLOG(10) << "enter Compiler::LowerBody";
-  // used for debug
-  for (auto elem : name_function_) {
-    VLOG(1) << "registered in map : " << elem.first << " second "
-            << &(elem.second);
-  }
-  auto nodes = paddle::framework::ir::TopologySortOperations(*graph);
-  for (auto* node : nodes) {
-    OpDesc* op = node->Op();
-    VLOG(1) << "node->type: " << op->Type();
-    PADDLE_ENFORCE_GT(
-        name_function_.count(op->Type()), 0,
-        platform::errors::NotFound(
-            "Do not found operator convert function, please make "
-            "sure it is registered in file \"supported_ops_autogen.h\" or "
-            "\"supported_ops_custom.h\""));
-    auto func = name_function_[op->Type()];
-    func(node->Op());
+std::vector<popart::TensorId> Compiler::GetOpInputs(const OpDesc* op) {
+  auto ins = op->Input("__inputs__");
+  std::vector<popart::TensorId> inputs;
+  for (const auto& in : ins) {
+    if (tensors_.find(in) != tensors_.end()) {
+      inputs.push_back(tensors_[in]);
+    } else {
+      inputs.push_back(in);
+    }
   }
-  VLOG(10) << "leave Compiler::LowerBody";
+  return inputs;
+}
+
+std::vector<std::string> Compiler::GetOpOutputs(const OpDesc* op) {
+  return op->Output("__outputs__");
 }
 
 }  // namespace ipu
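Taken together, compiler.cc now exposes a four-step lowering pipeline. A sketch of the intended call sequence (it mirrors IpuBackend::Compile below; the feed/fetch names are placeholders):

    // Sketch: driving the Compiler end to end.
    Compiler compiler;
    compiler.InitInputs(graph, {"image"});       // feeds -> builder inputs
    compiler.LowerWeights(graph, scope);         // persistable vars -> weights
    compiler.LowerBody(graph);                   // ops, in topological order
    compiler.InitOutputs({"loss"});              // fetches -> builder outputs
    auto onnx_proto = compiler.GetModelProto();  // serialized ONNX model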
diff --git a/paddle/fluid/framework/ipu/compiler.h b/paddle/fluid/framework/ipu/compiler.h
index 6f5c44b6b0ae6..edc45e1d0ea33 100644
--- a/paddle/fluid/framework/ipu/compiler.h
+++ b/paddle/fluid/framework/ipu/compiler.h
@@ -11,29 +11,13 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #pragma once
-#include
+
 #include <popart/builder.hpp>
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include "paddle/fluid/framework/ipu/ipu_strategy.h"
-#include "paddle/fluid/framework/ipu/ipu_utils.h"
 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_helper.h"
-#include "paddle/fluid/framework/ir/node.h"
-#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace framework {
@@ -41,42 +25,45 @@ namespace ipu {
 
 class Compiler {
  public:
-  explicit Compiler(const IpuStrategy *ipu_strategy);
+  Compiler();
   ~Compiler();
+
+  void RegisterOpFunc();
+  void LowerBody(const ir::Graph *graph);
   void InitInputs(ir::Graph *graph, const std::vector<std::string> &feed_list);
   void InitOutputs(const std::vector<std::string> &fetch_list);
   void LowerWeights(const ir::Graph *graph, const Scope *scope_);
-  void RegisterOpFunc();
-  void LowerBody(const ir::Graph *graph);
-  std::vector<popart::TensorId> GetOpInputs(const OpDesc *op);
-
-  void InsertTensors(std::vector<std::string> output_names,
-                     std::vector<std::string> tensor_ids);
-  void InsertTensors(std::vector<std::string> output_names,
-                     std::string tensor_id);
+  void InsertTensors(const std::vector<std::string> &output_names,
+                     const std::vector<std::string> &tensor_ids);
+  void InsertTensors(const std::vector<std::string> &output_names,
+                     const std::string &tensor_id);
   void SetIpuIndexStage(const std::vector<std::string> &tensor_ids,
                         const OpDesc *op_desc);
-  void SetIpuIndexStage(const std::string &tensor_id,
-                        const OpDesc *op_desc);
+  void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc);
 
   std::vector<popart::TensorId> GetInputs() { return inputs_; }
   std::vector<popart::TensorId> GetOutputs() { return outputs_; }
   std::map<std::string, popart::TensorId> GetTensors() { return tensors_; }
-  std::vector<int64_t> GetTensorShape(std::string name) {
-    return builder_->getTensorShape(tensors_[name]);
-  }
-  std::string GetModelProto() { return builder_->getModelProto(); };
-  void SaveModelProto(std::string name) { builder_->saveModelProto(name); }
+  std::vector<int64_t> GetTensorShape(const std::string &name);
+
+  std::string GetModelProto();
+  void SaveModelProto(const std::string &path);
+  void SaveModelProtoNoCheck(const std::string &path);
+
+ private:
+  std::vector<popart::TensorId> GetOpInputs(const OpDesc *op);
+  std::vector<std::string> GetOpOutputs(const OpDesc *op);
 
  private:
-  std::map<std::string, popart::TensorId> tensors_;
   std::unique_ptr<popart::Builder> builder_;
-  const IpuStrategy *ipu_strategy_;
+
+  std::map<std::string, popart::TensorId> tensors_;
   std::vector<popart::TensorId> inputs_;
   std::vector<popart::TensorId> outputs_;
-  using Func = std::function<void(OpDesc *)>;
-  std::unordered_map<std::string, Func> name_function_;
+
+  using OpFunc = std::function<void(OpDesc *)>;
+  std::unordered_map<std::string, OpFunc> name_function_;
 };
 
 }  // namespace ipu
diff --git a/paddle/fluid/framework/ipu/ipu_backend.cc b/paddle/fluid/framework/ipu/ipu_backend.cc
index f11581ee1de66..dee5b384499ad 100644
--- a/paddle/fluid/framework/ipu/ipu_backend.cc
+++ b/paddle/fluid/framework/ipu/ipu_backend.cc
@@ -38,75 +38,83 @@ namespace ipu {
 
 std::shared_ptr<IpuBackend> IpuBackend::instance_ = nullptr;
 
-IpuBackend::IpuBackend() {}
+IpuBackend::IpuBackend() { compiler_ = std::make_shared<Compiler>(); }
+
+IpuBackend::~IpuBackend() {
+  if (instance_ == nullptr) {
+    return;
+  }
+
+  // detach device
+  if (curr_device_ != nullptr && curr_device_->isAttached()) {
+    curr_device_->detach();
+  }
+}
+
+std::shared_ptr<IpuBackend> IpuBackend::GetInstance() {
+  if (!instance_) {
+    instance_.reset(new IpuBackend());
+  }
+  return instance_;
+}
 
 void IpuBackend::Compile(ir::Graph* graph,
                          const std::vector<std::string>& feed_list,
                          const std::vector<std::string>& fetch_list) {
-  VLOG(1) << "-- in Compile --";
-  compiler_ = std::make_shared<Compiler>(ipu_strategy_);
+  VLOG(10) << "enter IpuBackend::Compile";
   compiler_->InitInputs(graph, feed_list);
   compiler_->LowerWeights(graph, scope_);
   compiler_->LowerBody(graph);
   compiler_->InitOutputs(fetch_list);
-  VLOG(1) << "-- fetch_list --";
-  for (const auto& fetch_name : fetch_list) {
-    VLOG(1) << fetch_name;
-  }
+  VLOG(10) << "leave IpuBackend::Compile";
 }
IpuBackend::Compile"; compiler_->InitInputs(graph, feed_list); compiler_->LowerWeights(graph, scope_); compiler_->LowerBody(graph); compiler_->InitOutputs(fetch_list); - VLOG(1) << "-- fetch_list --"; - for (const auto& fetch_name : fetch_list) { - VLOG(1) << fetch_name; - } + VLOG(10) << "leave IpuBackend::Compile"; } -std::unique_ptr IpuBackend::GetPopartOptimizer() { - // TODO(xiaobingw): change type_ to enum - PADDLE_ENFORCE_NE( - optimizer_.type_, "", - platform::errors::InvalidArgument("Optimizer type have not been set.")); - if (optimizer_.type_ == "sgd") { - auto optimizer = std::make_unique( - popart::OptimizerValue(GetLRFromScope(), false), - popart::OptimizerValue(popart::SGD::getUnsetWeightDecay()), - popart::OptimizerValue(popart::SGD::getUnsetMomentum()), - popart::OptimizerValue(popart::SGD::getUnsetDampening()), - popart::OptimizerValue(popart::SGD::getUnsetVelocityScaling()), - popart::OptimizerValue(popart::SGD::getUnsetLossScaling())); - return optimizer; - } else if (optimizer_.type_ == "adam") { - auto optimizer = std::make_unique( - popart::OptimizerValue(GetLRFromScope(), false), - popart::OptimizerValue(popart::Adam::getUnsetWeightDecay()), - popart::OptimizerValue(GetOptimizerAttr("beta1"), false), - popart::OptimizerValue(GetOptimizerAttr("beta2"), false), - popart::OptimizerValue(GetOptimizerAttr("epsilon"), false), - popart::OptimizerValue(popart::Adam::getUnsetLossScaling()), - popart::AdamMode::Adam, popart::WeightDecayMode::Decay, - popart::DataType::FLOAT, popart::DataType::FLOAT, - popart::DataType::FLOAT); - return optimizer; - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Optimizer %s is not implemented now.", optimizer_.type_)); +void IpuBackend::Run(const std::vector& inputs, + const std::vector& outputs) { + if (!is_prepared_) { + Prepare(); + is_prepared_ = true; } -} -std::vector IpuBackend::GetTensorShape(const std::string& var_name) { - auto oshape = compiler_->GetTensorShape(var_name); - oshape.insert(oshape.begin(), ipu_strategy_->batches_per_step); - return oshape; + std::map popart_inputs; + std::map input_wrappers; + auto input_tensors = compiler_->GetInputs(); + for (size_t i = 0; i < inputs.size(); i++) { + auto tensor_id = input_tensors[i]; + auto tensor = const_cast(inputs[i]); + input_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id)); + } + + std::map popart_anchors; + std::map anchor_wrappers; + auto output_tensors = compiler_->GetOutputs(); + for (size_t i = 0; i < outputs.size(); i++) { + auto tensor_id = output_tensors[i]; + auto tensor = const_cast(outputs[i]); + anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); + popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); + } + + if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) { + VLOG(10) << "Update optimizer learning rate..."; + auto popart_optimizer = GetPopartOptimizer(); + auto session = dynamic_cast(session_.get()); + session->updateOptimizerFromHost(popart_optimizer.get()); + } + + popart::StepIO stepio(popart_inputs, popart_anchors); + VLOG(10) << "Running..."; + session_->run(stepio); + VLOG(10) << "Running...done"; } void IpuBackend::Prepare() { - VLOG(1) << "Get ModelProto ...\n"; + VLOG(10) << "Get ModelProto ...\n"; auto proto = compiler_->GetModelProto(); - - // for onnx graph debug - // std::ofstream onnxfile("paddle_model_no_check.onnx", - // std::ios_base::binary); - // onnxfile.write(proto.data(), proto.size()); - // onnxfile.close(); - - VLOG(1) << "Save Model 
-
-  VLOG(1) << "Constructing DataFlow\n";
+  VLOG(10) << "Constructing DataFlow\n";
   std::vector<popart::TensorId> anchor_ids;
   for (popart::TensorId item : compiler_->GetOutputs()) {
     anchor_ids.push_back(item);
@@ -119,7 +127,7 @@ void IpuBackend::Prepare() {
                         "IpuBackend::AttachDevice(id) first."));
 
   if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) {
-    VLOG(1) << "Creating TrainingSession from Onnx Model...";
+    VLOG(10) << "Creating TrainingSession from Onnx Model...";
     auto popart_optimizer = GetPopartOptimizer();
     auto tensors = compiler_->GetTensors();
     auto it = tensors.find(optimizer_.loss_);
@@ -132,62 +140,71 @@ void IpuBackend::Prepare() {
         popart::InputShapeInfo(), ipu_strategy_->popart_options_,
         popart::Patterns(popart::PatternsLevel::Default));
   } else {
-    VLOG(1) << "Creating InferenceSession from Onnx Model...";
+    VLOG(10) << "Creating InferenceSession from Onnx Model...";
     session_ = popart::InferenceSession::createFromOnnxModel(
         proto, dataFlow, curr_device_, popart::InputShapeInfo(),
         ipu_strategy_->popart_options_,
         popart::Patterns(popart::PatternsLevel::Default));
   }
-  VLOG(1) << "Creating session from Onnx Model...done";
+  VLOG(10) << "Creating session from Onnx Model...done";
 
-  VLOG(1) << "Preparing session device...";
+  VLOG(10) << "Preparing session device...";
   session_->prepareDevice();
-  VLOG(1) << "Preparing session device...done";
+  VLOG(10) << "Preparing session device...done";
 
-  VLOG(1) << "Copy weights from host to device...";
+  VLOG(10) << "Copy weights from host to device...";
   session_->weightsFromHost();
-  VLOG(1) << "Copy weights from host to device...done";
+  VLOG(10) << "Copy weights from host to device...done";
 }
 
-void IpuBackend::Run(const std::vector<const Tensor*>& inputs,
-                     const std::vector<const Tensor*>& outputs) {
-  if (!is_prepared_) {
-    Prepare();
-    is_prepared_ = true;
-  }
-
-  std::map<popart::TensorId, popart::IArray&> popart_inputs;
-  std::map<popart::TensorId, PaddleIArray> input_wrappers;
-  auto input_tensors = compiler_->GetInputs();
-  for (size_t i = 0; i < inputs.size(); i++) {
-    auto tensor_id = input_tensors[i];
-    auto tensor = const_cast<Tensor*>(inputs[i]);
-    input_wrappers.emplace(tensor_id, PaddleIArray(tensor));
-    popart_inputs.emplace(tensor_id, input_wrappers.at(tensor_id));
-  }
+std::vector<int64_t> IpuBackend::GetTensorShape(const std::string& var_name) {
+  auto oshape = compiler_->GetTensorShape(var_name);
+  oshape.insert(oshape.begin(), ipu_strategy_->batches_per_step);
+  return oshape;
+}
 
-  std::map<popart::TensorId, popart::IArray&> popart_anchors;
-  std::map<popart::TensorId, PaddleIArray> anchor_wrappers;
-  auto output_tensors = compiler_->GetOutputs();
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto tensor_id = output_tensors[i];
-    auto tensor = const_cast<Tensor*>(outputs[i]);
-    anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor));
-    popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id));
+std::unique_ptr<popart::Optimizer> IpuBackend::GetPopartOptimizer() {
+  // TODO(xiaobingw): change type_ to enum
+  PADDLE_ENFORCE_NE(
+      optimizer_.type_, "",
+      platform::errors::InvalidArgument("Optimizer type has not been set."));
+  if (optimizer_.type_ == "sgd") {
+    auto optimizer = std::make_unique<popart::SGD>(
+        popart::OptimizerValue(GetLRFromScope(), false),
+        popart::OptimizerValue(popart::SGD::getUnsetWeightDecay()),
+        popart::OptimizerValue(popart::SGD::getUnsetMomentum()),
+        popart::OptimizerValue(popart::SGD::getUnsetDampening()),
+        popart::OptimizerValue(popart::SGD::getUnsetVelocityScaling()),
+        popart::OptimizerValue(popart::SGD::getUnsetLossScaling()));
+    return optimizer;
+  } else if (optimizer_.type_ == "adam") {
+    auto optimizer = std::make_unique<popart::Adam>(
+        popart::OptimizerValue(GetLRFromScope(), false),
+        popart::OptimizerValue(popart::Adam::getUnsetWeightDecay()),
+        popart::OptimizerValue(GetOptimizerAttr("beta1"), false),
+        popart::OptimizerValue(GetOptimizerAttr("beta2"), false),
+        popart::OptimizerValue(GetOptimizerAttr("epsilon"), false),
+        popart::OptimizerValue(popart::Adam::getUnsetLossScaling()),
+        popart::AdamMode::Adam, popart::WeightDecayMode::Decay,
+        popart::DataType::FLOAT, popart::DataType::FLOAT,
+        popart::DataType::FLOAT);
+    return optimizer;
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Optimizer %s is not implemented now.", optimizer_.type_));
   }
+}
 
-  if (ipu_strategy_ != nullptr && ipu_strategy_->is_training) {
-    VLOG(1) << "Update optimizer learning rate...";
-    auto popart_optimizer = GetPopartOptimizer();
-    auto session = dynamic_cast<popart::TrainingSession*>(session_.get());
-    session->updateOptimizerFromHost(popart_optimizer.get());
+float IpuBackend::GetOptimizerAttr(const std::string& attr,
+                                   float default_value) {
+  if (optimizer_.attrs_.count(attr) == 0) {
+    return default_value;
   }
+  return optimizer_.attrs_.at(attr);
+}
 
-  popart::StepIO stepio(popart_inputs, popart_anchors);
-
-  VLOG(1) << "Running...";
-  session_->run(stepio);
-  VLOG(1) << "Running...done";
+void IpuBackend::SetOptimizerAttr(const std::string& attr, float value) {
+  optimizer_.attrs_[attr] = value;
 }
 
 float IpuBackend::GetLRFromScope() {
@@ -201,17 +218,8 @@ float IpuBackend::GetLRFromScope() {
   return tensor.data<float>()[0];
 }
 
-// ipu_num_ must be pow(2,n);
-int IpuBackend::UpperIpuNum() {
-  PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0,
-                    platform::errors::Unavailable(
-                        "The ipu num get is wrong, please make sure the "
-                        "sharding or pipline parameter is right."));
-  int i = 0;
-  while (pow(2, i) < ipu_strategy_->num_ipus) {
-    i++;
-  }
-  return pow(2, i);
+void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) {
+  ipu_strategy_ = &strategy;
 }
 
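GetPopartOptimizer reads everything from the Optimizer struct, so the Paddle side has to populate it before the session is prepared. A hedged sketch of that handshake (the attribute values and variable names are placeholders):

    // Sketch: configuring the state consumed by GetPopartOptimizer().
    auto backend = IpuBackend::GetInstance();
    backend->SetOptimizerType("adam");
    backend->SetOptimizerAttr("beta1", 0.9f);
    backend->SetOptimizerAttr("beta2", 0.999f);
    backend->SetOptimizerAttr("epsilon", 1e-6f);
    backend->SetLoss("mean_0.tmp_0");          // hypothetical loss var name
    backend->SetLRVarName("learning_rate_0");  // hypothetical LR var name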
- VLOG(1) << "comile ipu id = " << id; + VLOG(10) << "comile ipu id = " << id; bool ipu_model = GetBoolEnv("POPLAR_IPUMODEL"); if (ipu_model) { return; @@ -287,17 +295,20 @@ void IpuBackend::AttachDevice(int id) { "Can't attach IPU, ipu_num = %d.", UpperIpuNum())); } -IpuBackend::~IpuBackend() { - if (instance_ == nullptr) { - return; - } +bool IpuBackend::DeviceIsAttached() { return curr_device_ != nullptr; } - // detach device - if (curr_device_ != nullptr && curr_device_->isAttached()) { - curr_device_->detach(); +// ipu_num_ must be pow(2,n); +int IpuBackend::UpperIpuNum() { + PADDLE_ENFORCE_GT(ipu_strategy_->num_ipus, 0, + platform::errors::Unavailable( + "The ipu num get is wrong, please make sure the " + "sharding or pipline parameter is right.")); + int i = 0; + while (pow(2, i) < ipu_strategy_->num_ipus) { + i++; } + return pow(2, i); } -bool IpuBackend::DeviceIsAttached() { return curr_device_ != nullptr; } } // namespace ipu } // namespace framework diff --git a/paddle/fluid/framework/ipu/ipu_backend.h b/paddle/fluid/framework/ipu/ipu_backend.h index 13721d3d6b12f..7b787b568267d 100644 --- a/paddle/fluid/framework/ipu/ipu_backend.h +++ b/paddle/fluid/framework/ipu/ipu_backend.h @@ -52,74 +52,50 @@ class IpuBackend { IpuBackend(); ~IpuBackend(); + static std::shared_ptr GetInstance(); + void Compile(ir::Graph *graph, const std::vector &feed_list, const std::vector &fetch_list); - void Run(const std::vector &inputs, const std::vector &outputs); - std::string GetOptimizerType() { return optimizer_.type_; } - - void SetOptimizerType(const std::string &type) { optimizer_.type_ = type; } - - float GetOptimizerAttr(const std::string &name, float default_value = 0.0f) { - if (optimizer_.attrs_.count(name) == 0) { - return default_value; - } - return optimizer_.attrs_.at(name); - } - - void SetOptimizerAttr(const std::string &attr, float value) { - optimizer_.attrs_[attr] = value; - } - - void SetLoss(const std::string &loss) { optimizer_.loss_ = loss; } - - std::unique_ptr GetPopartOptimizer(); - std::vector GetTensorShape(const std::string &var_name); - - // SetScope, so we can get model parameters from scope void SetScope(const Scope &scope) { scope_ = &scope; } + // Optimizer + std::unique_ptr GetPopartOptimizer(); + std::string GetOptimizerType() { return optimizer_.type_; } + void SetOptimizerType(const std::string &type) { optimizer_.type_ = type; } + float GetOptimizerAttr(const std::string &attr, float default_value = 0.0f); + void SetOptimizerAttr(const std::string &attr, float value); + void SetLoss(const std::string &loss) { optimizer_.loss_ = loss; } void SetLRVarName(const std::string &name) { optimizer_.lr_var_name_ = name; } - // get fixed and adjustable learning rate from scope - float GetLRFromScope(); - - void SetIpuStrategy(const IpuStrategy &strategy) { - ipu_strategy_ = &strategy; - } - int UpperIpuNum(); + // IpuStrategy + void SetIpuStrategy(const IpuStrategy &strategy); size_t GetNumDevices(); std::vector GetDeviceIds(); Device GetDevice(int id); void AttachDevice(int id); bool DeviceIsAttached(); - static std::shared_ptr GetInstance() { - if (NULL == instance_) { - instance_.reset(new IpuBackend()); - } - return instance_; - } - private: void Prepare(); - void LowerWeights(const ir::Graph *); - void LowerBody(const ir::Graph *); - std::vector GetOpInputs(const OpDesc *op); + float GetLRFromScope(); + int UpperIpuNum(); private: + static std::shared_ptr instance_; + std::shared_ptr compiler_; + Optimizer optimizer_; + std::unique_ptr session_; + std::shared_ptr 
diff --git a/paddle/fluid/framework/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/framework/ipu/popart_canonicalization/nn_ops.cc
index 0559841789073..5af57621d8fcb 100644
--- a/paddle/fluid/framework/ipu/popart_canonicalization/nn_ops.cc
+++ b/paddle/fluid/framework/ipu/popart_canonicalization/nn_ops.cc
@@ -53,6 +53,7 @@ Node *conv2d_handler(Graph *graph, Node *node) {
 }
 
 Node *batch_norm_handler(Graph *graph, Node *node) {
+  // TODO(alleng) differs between training & inference
   auto *op = node->Op();
   std::vector<Node *> inputs;
   inputs.push_back(GetInputNode("X", node));
@@ -66,7 +67,7 @@ Node *batch_norm_handler(Graph *graph, Node *node) {
   outputs.push_back(GetOutputNode("VarianceOut", node));
   outputs.push_back(GetOutputNode("SavedMean", node));
   outputs.push_back(GetOutputNode("SavedVariance", node));
-  outputs.push_back(GetOutputNode("ReserveSpace", node));
+  // outputs.push_back(GetOutputNode("ReserveSpace", node));
   auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum"));
   auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
   // data_layout
diff --git a/paddle/fluid/framework/ipu/supported_ops_autogen.h b/paddle/fluid/framework/ipu/supported_ops_autogen.h
index 41411df5e56bc..dc3fb94add049 100644
--- a/paddle/fluid/framework/ipu/supported_ops_autogen.h
+++ b/paddle/fluid/framework/ipu/supported_ops_autogen.h
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// TODO(alleng) add `//clang-format off` in python
+// TODO(alleng) add `//clang-format off` in python, add `//NOLINT` for each op
 // clang-format off
 // Ops from AiGraphcoreOpset1
 OP_DECL(popart_gelu,aiOnnxOpset1.gelu, NONE)
-// OP_DECL(popart_groupnormalization, aiOnnxOpset1.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon))
+OP_DECL(popart_groupnormalization, aiOnnxOpset1.groupnormalization, ARG(INT,num_groups) ARG(FLOAT,epsilon))
 // OP_DECL(popart_reshape,aiOnnxOpset.reshape, NONE)
 // Ops from AiOnnxOpset10
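With the groupnormalization row re-enabled, the generated handler pulls num_groups and epsilon off the OpDesc through the ARG/GetAttrAllowNull machinery defined in RegisterOpFunc. Roughly, the call it ends up making is (illustrative expansion, not the literal preprocessor output):

    // Sketch: what OP_DECL(popart_groupnormalization, ...) builds, given
    // ARG(INT,num_groups) ARG(FLOAT,epsilon).
    auto output_ids = aiOnnxOpset1.groupnormalization(
        inputs,
        GetAttrAllowNull<std::int64_t>("num_groups", op_desc),
        GetAttrAllowNull<float>("epsilon", op_desc));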