From 4c77a9086c488a9a0b11d4e7f0c406c31716345e Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Fri, 14 Jan 2022 15:38:49 +0800 Subject: [PATCH 01/10] Add dygraph sharding stage3 (#38052) --- paddle/pten/core/dense_tensor.cc | 4 + .../meta_parallel/sharding/sharding_stage3.py | 675 ++++++++++++++++++ .../meta_parallel/sharding/sharding_utils.py | 31 +- .../fluid/tests/unittests/CMakeLists.txt | 3 + .../unittests/dygraph_sharding_stage3.py | 233 ++++++ .../unittests/test_dygraph_sharding_stage3.py | 31 + 6 files changed, 960 insertions(+), 17 deletions(-) create mode 100644 python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 0b5f5cb18e13d..eb6f834d72779 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -435,6 +435,10 @@ inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, } void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { + if (storage_ == nullptr) { + storage_ = make_intrusive( + paddle::platform::CPUPlace()); + } if (storage_ != nullptr && tensor.storage_ != nullptr) { storage_->set_data_shared(tensor.storage_->data_shared()); } diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py new file mode 100644 index 0000000000000..e5d04aac1551e --- /dev/null +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -0,0 +1,675 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import copy +import time +import contextlib +import logging +import functools +import numpy as np +from itertools import chain +from functools import reduce +from types import MethodType +from collections import deque, OrderedDict + +import paddle +from paddle import nn +from paddle.autograd import PyLayer +import paddle.fluid.core as core +import paddle.distributed as dist +from paddle.fluid.framework import ParamBase +from paddle.fluid.clip import ClipGradByGlobalNorm +from paddle.distributed.collective import _get_global_group + +from .sharding_utils import Type, ShardingClipGrad +from ..pp_utils.utils import _all_gather + +# CUDA alignment 256 bytes +alignment = {"gpu": 256, } +align = { + Type.fp16.value: 2, + Type.fp32.value: 4, +} + +global CHECK_LAYER +CHECK_LAYER = dict() # Help to check layer's id -> layer's name + + +class ShardingStage3(nn.Layer): + """ + A wrapper for Sharding Stage3 Layer in Dygraph. + + .. warning: ShardingStage3 encapsulates the layer strategy and integrates it into the nn.Layer. + + .. ZeRO: https://arxiv.org/pdf/1910.02054.pdf. 
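
    Examples:
        A minimal usage sketch under a collective multi-GPU launch, mirroring
        this PR's unit test. ``MLP``, the optimizer settings and the random
        input are illustrative placeholders, not part of this API:

        .. code-block:: python

            # launched with: python -m paddle.distributed.launch --gpus=0,1 train.py
            import paddle
            from paddle.distributed import fleet
            from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3

            fleet.init(is_collective=True)

            model = MLP()  # any paddle.nn.Layer; MLP is a placeholder
            optimizer = paddle.optimizer.AdamW(
                learning_rate=0.001, parameters=model.parameters())

            # Parameters are partitioned across the ranks of the group, and
            # optimizer.step()/clear_grad() are redefined to update only the
            # local parameter slices.
            model = ShardingStage3(model, optimizer=optimizer)

            img = paddle.rand([32, 1000])
            out = model(img)
            loss = paddle.mean(out)
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()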
+ """ + + def __init__(self, + layer, + optimizer, + group=None, + sync_buffers=False, + device="gpu", + pertrain_sync_models=True, + accumulate_grads=False, + offload=False, + sync_comm=False): + super().__init__() + + # Default configs + assert core.is_compiled_with_cuda(), "Only support CUDA." + self._layer = layer + self._default_device = device + self.__sync_buffers = sync_buffers + self._accumulate_grads = accumulate_grads + self._offload = offload + self._sync_comm = sync_comm + + # Communication group establishment + self._group = dist.new_group(_get_global_group() + .ranks) if group is None else group + self._world_size_scaling = 1.0 / self._group.nranks + assert self._group.nranks > 1, "Training must be distributed, ranks must be greater than 1." + self._rank = self._group.rank + self._global_root_rank = 0 # picking rank 0 as the reference + self._global_ranks = self._group.ranks + self._param2buffer_size = dict() # {param.name: size} + self._param2buffer = dict( + ) # {param.name: [(start0, end0),(start1, end1), ...]} + self._trainable_params = dict() # {layer.name: [trainable_params]} + + assert not isinstance( + optimizer, list), "Multiple optimizers are not supported now." + self._optim = _OptimizerWrapper(optimizer, self._offload, self._group, + self._update_params_slice) + self._ori_parameter_list = self._optim._parameter_list + self._ori_param_groups = self._optim._param_groups + + # Replace optimizer's _grad_clip + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingStage3, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + paddle.get_device(), + self._group) + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() + + self._segment_rank_params(self._layer) + + # In the first step, record the execution order of the layer + self._order_tracer = OrderedDict() + self._order_tracer["order"] = 0 + self._order_tracer["layer"] = [] + # Register task flow + self._task_flow = TaskFlow() + # Register forward hooks + self._register_forward_hooks(self._layer) + # Register backward parameter hooks + self._register_backward_hooks() + # Redefine optimizer step and clear function + self._redefine_opt_step() + self._redefine_opt_clear() + + @paddle.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._layer.parameters(): + dist.broadcast( + p, + src=self._global_root_rank, + group=self._group, + use_calc_stream=True) + + # Multi stream operation will be supported later + dist.wait(tensor=p, group=self._group, use_calc_stream=True) + + def _clear_gradients(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format(param.name) + + # param.bw_storage.zero_() + param.fw_storage.clear_gradient(False) + param.fw_storage._gradient_set_empty(False) + param.bw_storage._clear() + + # Update param memery slice + def _update_params_slice(self): + update_list = self._update_params() + + if not isinstance(self._optim._param_groups[0], dict): + slice_params = [param.fw_storage for param in update_list] + self._optim._parameter_list = slice_params + self._optim._param_groups = 
slice_params + else: + params_name_list = list(map(lambda p: p.name, update_list)) + for param_group in self._optim._param_groups: + slice_p = [] + for p in param_group['params']: + if p.name in params_name_list: + assert hasattr( + p, "fw_storage" + ), "Find {} don't have fw_storage attribute.".format( + p.name) + slice_p.append(p.fw_storage) + param_group['params'] = slice_p + + def forward(self, *inputs, **kwargs): + """ + A wrapper for Sharding Stage3 layer. + """ + # 1.Sync layer's buffers state + if self.__sync_buffers: + self._sync_buffers() + + # 2.Normal FW on the base model + fw = self._layer(*inputs, **kwargs) + + return fw + + def _segment_rank_params(self, layer, name="last_layer"): + current_layer_params = _current_layer_params(layer) + if current_layer_params: + CHECK_LAYER[id(layer)] = name + self._flatten_layer_params(layer, current_layer_params) + + for name, sub_layer in layer.named_children(): + self._segment_rank_params(sub_layer, name) + + def _flatten_layer_params(self, layer, current_layer_params): + def _add_manage_info(trainable_param): + return _PartitionParam(trainable_param) + + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + assert id(layer) not in self._trainable_params.keys() + self._trainable_params[id(layer)] = list( + map(_add_manage_info, trainable_params)) + + for param in self._trainable_params[id(layer)]: + if param.name in self._param2buffer.keys(): + continue + self._param2buffer[param.name] = [] + # 1.Params alignment + offset = 0 + # CUDA alignment 256 bytes + size = param._numel() * align[param.dtype] + remaining = size % alignment[self._default_device] + ali = 0 if remaining == 0 else alignment[ + self._default_device] - remaining + align_ = ali // align[param.dtype] + + offset = align_ + param._numel() + buffer_size = offset if offset % self._group.nranks == 0 else offset + self._group.nranks - ( + offset % self._group.nranks) + self._param2buffer_size[param.name] = buffer_size + + # 2.Combination param buffer + assert buffer_size % self._group.nranks == 0 + pre_buffer = buffer_size // self._group.nranks + + for rank_ in range(self._group.nranks): + self._param2buffer[param.name].append( + (rank_ * pre_buffer, (rank_ + 1) * pre_buffer)) + + # 3.Flatten layer params and release other rank buffer + self._param_storage(param, buffer_size) + + def _param_storage(self, param, buffer_size): + assert isinstance(buffer_size, int) + value = np.zeros( + buffer_size, + dtype=np.float16) if Type.fp16.value == param.dtype else np.zeros( + buffer_size, dtype=np.float32) + buffer = core.VarBase(value=value, place=core.CPUPlace()) + + param_shape = param.shape + origin_state = param.stop_gradient + param.stop_gradient = True + param.flatten_() + param.stop_gradient = origin_state + start, end = self._param2buffer[param.name][self._rank] + + # Copy the current param value + tmp_var = core.VarBase( + tensor=buffer._slice(0, param._numel()), place=core.CPUPlace()) + param_cpu = param.cpu() + tmp_var.value().get_tensor().set(param_cpu.value().get_tensor(), + core.CPUPlace()) + param.value().get_tensor()._set_dims(param_shape) + param._clear() + + # Current rank param_storage + param.fw_storage = core.VarBase( + buffer._slice(start, end), "slice@" + param.name) + param.status = "part" + + # Updata optimizer master weights + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.fw_storage.name] = paddle.cast( + param.fw_storage, Type.fp32.value) + + def _register_forward_hooks(self, layer): + current_layer_params = 
_current_layer_params(layer) + if current_layer_params: + self._register_forward_all_hooks(layer, self._task_flow) + + for _, sub_layer in layer.named_children(): + self._register_forward_hooks(sub_layer) + + def _register_forward_all_hooks(self, sub_layer, task_flow): + def _forward_pre_hook(layer, inputs): + return ForwardPreHooks(layer, self._order_tracer, + self._trainable_params, self._param2buffer, + self._rank, self._group, self._sync_comm, + task_flow) + + def _forward_post_hook(layer, inputs, outputs): + return ForwardPostHooks.apply( + outputs, layer, self._order_tracer, self._trainable_params, + self._param2buffer, self._param2buffer_size, self._rank, + self._group, self._sync_comm, task_flow) + + # register previous forward hooks + sub_layer.register_forward_pre_hook(_forward_pre_hook) + + # register post forward hooks + sub_layer.register_forward_post_hook(_forward_post_hook) + + @paddle.no_grad() + def _sync_buffers(self): + for buffer in self._layer.buffers(include_sublayers=True): + dist.broadcast( + buffer, + self._global_root_rank, + self._group, + use_calc_stream=True) + # Multi stream operation will be supported later + dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) + + def __getattr__(self, name): + """Forward missing attributes to wrapped layer.""" + try: + return super().__getattr__(name) + except AttributeError: + return getattr(self._layer, name) + + def _update_params(self): + update_list = [] + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + if self._accumulate_grads: + param.bw_storage.scale_(scale=self._world_size_scaling) + param.fw_storage = _VarBaseWrapper(param) + param.fw_storage._copy_gradient_from(param.bw_storage) + update_list.append(param) + return update_list + + def get_all_parameters(self): + assert len(self._trainable_params.keys()) > 0 + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + for param in trainable_params: + if param.use_count > 0: + continue + assert hasattr( + param, + "fw_storage"), "Find {} don't have fw_storage attribute".format( + param.name) + + full_param = _all_gather( + param.fw_storage, self._group, use_calc_stream=True) + dist.wait( + tensor=full_param, group=self._group, use_calc_stream=True) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + + self._optim._parameter_list = self._ori_parameter_list + self._optim._param_groups = self._ori_param_groups + + def _register_backward_hooks(self): + current_layer_params = self._layer.parameters(include_sublayers=True) + trainable_params = list( + filter(lambda x: x.trainable, current_layer_params)) + + for param in trainable_params: + allreduce_function = self._get_allreduce_fn(param) + param._register_backward_hook(allreduce_function) + + def _get_allreduce_fn(self, param): + @paddle.no_grad() + def reduce(*_): + if param.name in self._task_flow.full_grad.keys(): + full_grad = self._task_flow.full_grad[param.name] + with paddle.amp.auto_cast(enable=False): + if not self._accumulate_grads: 
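+                        # Pre-scale the dense gradient by 1/nranks so the
+                        # summing all_reduce below yields the mean across ranks.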
+ full_grad.scale_(scale=self._world_size_scaling) + # Only support sync allreduce current rank's layer now + dist.all_reduce( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + dist.wait( + tensor=full_grad, + group=self._group, + use_calc_stream=True) + + start, end = self._param2buffer[param.name][self._rank] + if not self._accumulate_grads or param.bw_storage is None: + param.bw_storage = core.VarBase( + full_grad._slice(start, end)).detach().clone() + else: + param.bw_storage.add_( + core.VarBase(full_grad._slice(start, end)).detach() + .clone()) + param.clear_gradient(False) + param._gradient_set_empty(False) + tmp_var = self._task_flow.full_grad.pop(param.name) + tmp_var._clear() + + if param.name in self._task_flow.full_param.keys(): + if param.status == "all": + param.use_count = 0 + param._clear() + start, end = self._param2buffer[param.name][self._rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + self._task_flow.full_param[param.name]._slice(start, + end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = self._task_flow.full_param.pop(param.name) + tmp_var._clear() + + return reduce + + def _redefine_opt_step(self): + params_slice_func = self._update_params_slice + opt_step = self._optim.step + update_scaler = self._optim.update_scaler + + def _opt_step(self): + if not update_scaler: + params_slice_func() + opt_step() + + self._optim.step = MethodType(_opt_step, self._optim) + + def _redefine_opt_clear(self): + clear_func = self._clear_gradients + + def _opt_clear(self): + clear_func() + + self._optim.clear_grad = MethodType(_opt_clear, self._optim) + + +def ForwardPreHooks(layer, order_tracer, trainable_params, param2buffer, rank, + group, sync_comm, task_flow): + + # Record layer's id + layer_id = id(layer) + use_calc, sync_wait = False, False + + if layer_id not in order_tracer.keys() or sync_comm: + use_calc, sync_wait = True, True + task_flow.use_calc[layer_id] = use_calc + else: + task_flow.use_calc[layer_id] = use_calc + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + + if layer_id == order_tracer["layer"][-1]: return + order_ = order_tracer[layer_id] + layer_id = order_tracer["layer"][order_ + 1] + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + return + + +class ForwardPostHooks(PyLayer): + @staticmethod + def forward(ctx, inputs, layer, order_tracer, trainable_params, + param2buffer, param2buffer_size, rank, group, sync_comm, + task_flow): + _release_param(layer, trainable_params, param2buffer, rank, task_flow) + + layer_id = id(layer) + if layer_id not in order_tracer.keys(): + order_ = order_tracer["order"] + order_tracer[layer_id] = order_ + order_tracer["order"] += 1 + order_tracer["layer"].append(layer_id) + ctx.order_tracer = order_tracer + ctx.task_flow = task_flow + ctx.group = group + ctx.layer = layer + ctx.sync_comm = sync_comm + ctx.trainable_params = trainable_params + ctx.param2buffer_size = param2buffer_size + + return inputs + + @staticmethod + def backward(ctx, *args): + # Load context value + order_tracer = ctx.order_tracer + task_flow = ctx.task_flow + group = ctx.group + layer = ctx.layer + trainable_params = ctx.trainable_params + param2buffer_size = ctx.param2buffer_size + sync_comm = ctx.sync_comm + layer_id = id(layer) + use_calc, sync_wait = False, False + if sync_comm: + use_calc, sync_wait = True, True + _allgather_buffer( + layer_id, + trainable_params, 
+ group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + else: + _wait_layer(trainable_params, layer_id, task_flow, group, use_calc) + _create_params_grad(layer, trainable_params, param2buffer_size, + task_flow) + task_flow.use_calc[layer_id] = use_calc + if layer_id != order_tracer["layer"][0] and not sync_comm: + layer_next_id = order_tracer["layer"][order_tracer[layer_id] - 1] + _allgather_buffer( + layer_next_id, + trainable_params, + group, + use_calc_stream=use_calc, + task_flow=task_flow, + sync_wait=sync_wait) + + return args + + +class TaskFlow: + """ + Task flows, one way linked list for task acquisition. + """ + + def __init__(self, + full_param=dict(), + full_grad=dict(), + use_calc=dict(), + callback=None): + self.full_param = full_param + self.full_grad = full_grad + self.use_calc = use_calc + self.callback = callback + + +def _release_param(layer, trainable_params, param2buffer, rank, task_flow): + for param in trainable_params[id(layer)]: + # async communicate share weight not clear + param.use_count -= 1 + if param.use_count == 0: + param._clear() + if param.name in task_flow.full_param.keys(): + start, end = param2buffer[param.name][rank] + with paddle.amp.auto_cast(enable=False): + param.fw_storage = core.VarBase( + task_flow.full_param[param.name]._slice(start, end), + param.name + "@slice").detach().clone() + param.status = "part" + tmp_var = task_flow.full_param.pop(param.name) + tmp_var._clear() + return + + +def _wait_layer(trainable_params, layer_id, task_flow, group, use_calc_stream): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + if param.name in task_flow.full_param.keys(): + full_param = task_flow.full_param[param.name] + with paddle.amp.auto_cast(enable=False): + paddle.device.cuda.synchronize() + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + else: + _allgather_buffer( + layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=True) + break + return task_flow + + +def _allgather_buffer(layer_id, + trainable_params, + group, + use_calc_stream, + task_flow, + sync_wait=False): + for param in trainable_params[layer_id]: + if param.status == "all": + param.use_count += 1 + continue + with paddle.amp.auto_cast(enable=False): + full_param = _all_gather( + param.fw_storage, group, use_calc_stream=use_calc_stream) + if sync_wait: + with paddle.amp.auto_cast(enable=False): + dist.wait( + tensor=full_param, + group=group, + use_calc_stream=use_calc_stream) + core.VarBase(full_param._slice(0, param._numel()))._share_buffer_to( + param) + param.value().get_tensor()._set_dims(param.shape) + param.fw_storage._clear() + param.fw_storage = None + param.status = "all" + param.use_count += 1 + task_flow.full_param[param.name] = full_param + return task_flow + + +@paddle.no_grad() +def _create_params_grad(layer, trainable_params, param2buffer_size, task_flow): + for param in trainable_params[id(layer)]: + if param.name in task_flow.full_grad.keys(): + continue + assert isinstance(param2buffer_size[param.name], int) + temp_grad = paddle.zeros( + [param2buffer_size[param.name]], dtype=param.dtype) + param._copy_gradient_from( + core.VarBase(temp_grad._slice(0, param._numel()))) + task_flow.full_grad[param.name] = temp_grad + return task_flow + + +def _PartitionParam(param): + if not 
hasattr(param, "fw_storage"): + setattr(param, "fw_storage", None) + setattr(param, "bw_storage", None) + setattr(param, "status", "all") + setattr(param, "use_count", 0) + return param + + +def _VarBaseWrapper(param): + varbase = param.fw_storage + tmp_param = ParamBase( + shape=varbase.shape, dtype=varbase.dtype, name="slice@" + param.name) + varbase._share_buffer_to(tmp_param) + tmp_param.regularizer = param.regularizer + tmp_param.optimize_attr['learning_rate'] = param.optimize_attr[ + 'learning_rate'] + varbase._clear() + return tmp_param + + +def _OptimizerWrapper(optimizer, offload, group, update_params_slice): + if not hasattr(optimizer, "_optim"): + setattr(optimizer, "_optim", optimizer) + setattr(optimizer, "offload", offload) + setattr(optimizer, "group", group) + setattr(optimizer, "update_scaler", None) + setattr(optimizer, "update_slice", update_params_slice) + return optimizer + + +def _current_layer_params(layer): + return layer.parameters( + include_sublayers=False) + list(layer.extra_parameters) if hasattr( + layer, "extra_parameters") else layer.parameters( + include_sublayers=False) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index 272aada576be8..5f696195c1abc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -152,6 +152,9 @@ def unscale_method(self, optimizer): param_grads = [] param_grads_fp16 = [] param_grads_fp32 = [] + if hasattr(optimizer, "update_slice"): + optimizer.update_slice() + optimizer.update_scaler = True if getattr(optimizer._optim, '_param_groups', None) and isinstance( optimizer._optim._param_groups[0], dict): @@ -161,27 +164,21 @@ def unscale_method(self, optimizer): if param._grad_ivar() is not None: param_grads.append(param._grad_ivar()) if param._grad_ivar( - ).dtype == core.VarDesc.VarType.FP16: + ).dtype in [core.VarDesc.VarType.FP16, paddle.float16]: param_grads_fp16.append(param._grad_ivar()) else: param_grads_fp32.append(param._grad_ivar()) else: - param_grads = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if param._grad_ivar() is not None - ] - param_grads_fp16 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 - ) - ] - param_grads_fp32 = [ - param._grad_ivar() for param in optimizer._optim._parameter_list - if (param._grad_ivar() is not None - ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 - ) - ] + for param in optimizer._optim._parameter_list: + if param.grad is not None: + param_grads.append(param.grad) + if param.grad.dtype in [ + core.VarDesc.VarType.FP16, paddle.float16 + ]: + param_grads_fp16.append(param.grad) + else: + param_grads_fp32.append(param.grad) + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 67697fcfd8398..c0c13866ccd55 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -34,6 +34,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS 
test_dygraph_sharding_optimizer_stage2) list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage2) +list(APPEND DIST_TEST_OPS test_dygraph_sharding_stage3) list(APPEND DIST_TEST_OPS test_auto_parallel_parallelizer) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) list(APPEND DIST_TEST_OPS test_hybrid_parallel_inference_helper) @@ -250,6 +251,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sharding_parallel) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_optimizer_stage2) list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage2) + list(REMOVE_ITEM TEST_OPS test_dygraph_sharding_stage3) list(REMOVE_ITEM TEST_OPS test_auto_parallel_parallelizer) list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mp_layers) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) @@ -1058,6 +1060,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_optimizer_stage2 PROPERTIES TIMEOUT 120) set_tests_properties(test_dygraph_sharding_stage2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_dygraph_sharding_stage3 PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_parallelizer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..5b0bec9c454b0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -0,0 +1,233 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
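+
+# Trains the same MLP with sharding stage2 and stage3 (fp32/fp16, with and
+# without gradient accumulation, plus a sync_comm/recompute variant) and
+# asserts the resulting parameters match; see test_stage2_stage3() below.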
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +epoch = 10 +batch_size = 32 +paddle.seed(2021) +np.random.seed(2021) +base_lr = 0.1 +momentum_rate = 0.9 +l2_decay = 1e-4 +fleet.init(is_collective=True) + + +class MLP(fluid.Layer): + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW( + parameters=[{ + "params": model.parameters() + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + recompute=False): + group = paddle.distributed.new_group([0, 1]) + if opt_group: + optimizer = optimizer_setting( + model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if use_pure_fp16: + model = paddle.amp.decorate( + models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler) + if sharding_stage == 2: + optimizer = ShardingOptimizerStage2( + params=model.parameters(), optim=optimizer, group=group) + model = ShardingStage2( + model, + optimizer, + group=group, + buffer_max_size=2**21, + accumulate_grads=accumulate_grad) + elif sharding_stage == 3: + model = ShardingStage3( + model, optimizer=optimizer, group=group, sync_comm=recompute) + + train_reader = paddle.batch( + reader_decorator(), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if not accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + 
scaler.update() + optimizer.clear_grad() + if accumulate_grad: + if not use_pure_fp16: + avg_loss.backward() + optimizer.step() + else: + scaler.scale(avg_loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.clear_grad() + if sharding_stage == 3: + model.get_all_parameters() + return model.parameters() + + +def test_stage2_stage3(): + mlp, mlp1, mlp2, mlp3, mlp4, mlp5, mlp6, mlp7, mlp8 = MLP(), MLP(), MLP( + ), MLP(), MLP(), MLP(), MLP(), MLP(), MLP() + state_dict = mlp.state_dict() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + mlp8.set_state_dict(state_dict) + # fp32 + stage2_params = train_mlp( + mlp1, sharding_stage=2, use_pure_fp16=False, opt_group=True) + stage3_params = train_mlp( + mlp2, sharding_stage=3, use_pure_fp16=False, opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp32 accumulate grad + stage2_params = train_mlp( + mlp3, + sharding_stage=2, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + stage3_params = train_mlp( + mlp4, + sharding_stage=3, + use_pure_fp16=False, + accumulate_grad=True, + opt_group=True) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 + stage2_params = train_mlp( + mlp5, sharding_stage=2, use_pure_fp16=True, opt_group=False) + stage3_params = train_mlp( + mlp6, sharding_stage=3, use_pure_fp16=True, opt_group=False) + for i in range(len(stage2_params)): + for j in range(len(stage3_params)): + if stage2_params[i].name == stage3_params[j].name: + np.testing.assert_allclose( + stage2_params[i].numpy(), + stage3_params[j].numpy(), + rtol=1e-6) + # fp16 recompute + stage3_params = train_mlp( + mlp7, sharding_stage=3, use_pure_fp16=True, opt_group=False) + stage3_params_re = train_mlp( + mlp8, + sharding_stage=3, + use_pure_fp16=True, + opt_group=False, + recompute=True) + for i in range(len(stage3_params)): + for j in range(len(stage3_params_re)): + if stage3_params[i].name == stage3_params_re[j].name: + np.testing.assert_allclose( + stage3_params[i].numpy(), + stage3_params_re[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_stage2_stage3() diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py new file mode 100644 index 0000000000000..89d5f2e8c7b29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage3.py @@ -0,0 +1,31 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestDygraphShardingStage3(TestMultipleGpus): + + # check sharding logic as well as the accuracy with single mode + def test_dygraph_sharding_optimizer_stage3(self): + self.run_mnist_2gpu('dygraph_sharding_stage3.py') + + +if __name__ == "__main__": + unittest.main() From 0de8a805a89eb70203163a34858ff504afff30df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:05:00 +0800 Subject: [PATCH 02/10] [infrt] update the version of llvm. test=develop (#38843) --- cmake/external/llvm.cmake | 13 +- paddle/infrt/CMakeLists.txt | 1 - paddle/infrt/common/global.h | 2 +- paddle/infrt/dialect/CMakeLists.txt | 6 +- paddle/infrt/dialect/basic_kernels.cc | 22 +-- paddle/infrt/dialect/basic_kernels.h | 5 +- paddle/infrt/dialect/basic_kernels.td | 7 +- paddle/infrt/dialect/dense_tensor.cc | 148 +++++------------- paddle/infrt/dialect/dense_tensor.h | 51 ++++-- paddle/infrt/dialect/diagnostic_utils.cc | 7 +- paddle/infrt/dialect/diagnostic_utils.h | 6 +- paddle/infrt/dialect/dialect.cc | 16 +- paddle/infrt/dialect/infrt_base.cc | 6 +- paddle/infrt/dialect/infrt_base.h | 32 ++-- paddle/infrt/dialect/infrt_base.td | 6 +- paddle/infrt/dialect/init_infrt_dialects.cc | 12 +- paddle/infrt/dialect/init_infrt_dialects.h | 8 +- paddle/infrt/dialect/mlir_loader.cc | 18 ++- paddle/infrt/dialect/mlir_loader.h | 9 +- paddle/infrt/dialect/mlir_loader_test.cc | 11 +- paddle/infrt/dialect/mlir_tests/rewrite.mlir | 2 +- .../dialect/mlir_tests/rewrite_conv_bn.mlir | 2 +- paddle/infrt/dialect/mlir_tests/trt_ops.mlir | 2 +- paddle/infrt/dialect/ops.td | 6 - paddle/infrt/dialect/opt.cc | 26 +-- paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/pd_ops.cc | 29 ++-- paddle/infrt/dialect/pd_ops.h | 36 ++--- paddle/infrt/dialect/pd_ops.td | 14 +- paddle/infrt/dialect/pd_types.h | 11 +- paddle/infrt/dialect/print_ir.cc | 45 +++--- paddle/infrt/dialect/tensor_shape.cc | 16 +- paddle/infrt/dialect/tensor_shape.h | 8 +- paddle/infrt/dialect/tensor_shape_base.td | 4 +- paddle/infrt/dialect/tensorrt/trt_exec.cc | 4 +- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 78 +++++---- .../dialect/tensorrt/trt_graph_fuse_pass.h | 12 +- .../dialect/tensorrt/trt_graph_split_pass.cc | 20 +-- .../dialect/tensorrt/trt_graph_split_pass.h | 10 +- .../dialect/tensorrt/trt_op_teller_pass.cc | 25 ++- .../dialect/tensorrt/trt_op_teller_pass.h | 14 +- paddle/infrt/dialect/tensorrt/trt_ops.cc | 22 ++- paddle/infrt/dialect/tensorrt/trt_ops.h | 41 +++-- paddle/infrt/dialect/test_kernels.cc | 75 ++++----- paddle/infrt/dialect/test_kernels.h | 7 +- paddle/infrt/dialect/types.cc | 17 -- paddle/infrt/dialect/types.h | 16 -- paddle/infrt/host_context/core_runtime.cc | 6 +- paddle/infrt/host_context/core_runtime.h | 6 +- paddle/infrt/host_context/kernel_frame.h | 6 +- .../host_context/kernel_registry_test.cc | 6 +- .../infrt/host_context/kernel_utils_test.cc | 6 +- .../host_context/mlir_function_executable.cc | 1 + .../host_context/mlir_function_executable.h | 3 +- .../host_context/mlir_program_executor.h | 4 +- .../host_context/mlir_to_runtime_translate.cc | 90 ++++++----- .../host_context/mlir_to_runtime_translate.h | 8 +- .../mlir_to_runtime_translate_test.cc | 12 +- 
paddle/infrt/host_context/op_executable.cc | 7 +- paddle/infrt/host_context/op_executable.h | 12 +- paddle/infrt/kernel/basic_kernels.cc | 6 +- paddle/infrt/kernel/basic_kernels.h | 12 +- paddle/infrt/kernel/tensor_kernels.cc | 6 +- paddle/infrt/kernel/tensor_kernels.h | 12 +- paddle/infrt/kernel/tensor_shape_kernels.cc | 6 +- paddle/infrt/kernel/tensor_shape_kernels.h | 12 +- paddle/infrt/kernel/test_kernels.cc | 6 +- paddle/infrt/kernel/test_kernels.h | 12 +- paddle/infrt/paddle/cpp/desc_api.h | 8 +- paddle/infrt/paddle/model_parser.cc | 6 +- paddle/infrt/paddle/model_parser.h | 6 +- paddle/infrt/paddle/pb/block_desc.cc | 8 +- paddle/infrt/paddle/pb/block_desc.h | 8 +- paddle/infrt/paddle/pb/op_desc.cc | 8 +- paddle/infrt/paddle/pb/op_desc.h | 8 +- paddle/infrt/paddle/pb/program_desc.cc | 8 +- paddle/infrt/paddle/pb/program_desc.h | 8 +- paddle/infrt/paddle/pb/var_desc.cc | 8 +- paddle/infrt/paddle/pb/var_desc.h | 8 +- 79 files changed, 616 insertions(+), 637 deletions(-) delete mode 100644 paddle/infrt/dialect/ops.td delete mode 100644 paddle/infrt/dialect/types.cc delete mode 100644 paddle/infrt/dialect/types.h diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index e080a7359af98..27210e5260048 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -1,7 +1,7 @@ include(FetchContent) -set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz) -set(LLVM_MD5 39d32b6be466781dddf5869318dcba53) +set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz) +set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e) set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm) set(FETCHCONTENT_QUIET OFF) @@ -51,7 +51,7 @@ message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") # To build with MLIR, the LLVM is build from source code using the following flags: #[==[ -cmake -G Ninja ../llvm \ +cmake ../llvm -G "Unix Makefiles" \ -DLLVM_ENABLE_PROJECTS="mlir;clang" \ -DLLVM_BUILD_EXAMPLES=OFF \ -DLLVM_TARGETS_TO_BUILD="X86" \ @@ -59,8 +59,10 @@ cmake -G Ninja ../llvm \ -DLLVM_ENABLE_ASSERTIONS=ON \ -DLLVM_ENABLE_ZLIB=OFF \ -DLLVM_ENABLE_RTTI=ON \ + -DLLVM_INSTALL_UTILS=ON \ + -DCMAKE_INSTALL_PREFIX=./install #]==] -# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit) +# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit) add_definitions(${LLVM_DEFINITIONS}) @@ -75,7 +77,7 @@ add_definitions(${LLVM_DEFINITIONS}) # The minimum needed libraries for MLIR IR parse and transform. 
-set(MLIR_IR_LIBS MLIRAnalysis MLIRStandardOps MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) +set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib) # tb_base is the name of a xxx.td file (without the .td suffix) @@ -89,6 +91,7 @@ function(mlir_tablegen_on td_base) mlir_tablegen(${td_base}.cpp.inc -gen-op-defs) if (mlir_tablegen_on_DIALECT) mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls -dialect=${mlir_tablegen_on_DIALECT}) + mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs -dialect=${mlir_tablegen_on_DIALECT}) endif() add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 8f05d286bf033..8af3012a220ad 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -77,7 +77,6 @@ add_subdirectory(paddle) # MLIR td file generations set(infrt_mlir_incs - ops_inc basic_kernels_inc test_kernels_inc infrt_base_inc diff --git a/paddle/infrt/common/global.h b/paddle/infrt/common/global.h index f89164d03f31d..e6586cb3a3c60 100644 --- a/paddle/infrt/common/global.h +++ b/paddle/infrt/common/global.h @@ -14,7 +14,7 @@ #pragma once -#include "mlir/IR/MLIRContext.h" +#include #include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { diff --git a/paddle/infrt/dialect/CMakeLists.txt b/paddle/infrt/dialect/CMakeLists.txt index d145843684c63..c064b2145266b 100644 --- a/paddle/infrt/dialect/CMakeLists.txt +++ b/paddle/infrt/dialect/CMakeLists.txt @@ -2,7 +2,6 @@ core_gather_headers() gather_srcs(infrt_src SRCS dialect.cc - types.cc basic_kernels.cc test_kernels.cc infrt_base.cc @@ -14,8 +13,6 @@ gather_srcs(infrt_src SRCS pd_types.cc pd_ops.cc ) - -mlir_tablegen_on(ops) mlir_tablegen_on(basic_kernels) mlir_tablegen_on(test_kernels) mlir_tablegen_on(infrt_base DIALECT infrt) @@ -27,8 +24,7 @@ mlir_add_rewriter(rewrite) # TODO(Superjomn) add a cmake function cc_executable to ecapsulate the following code add_executable(infrtopt opt.cc) -target_link_libraries(infrtopt infrt ${mlir_libs}) -add_dependencies(infrtopt infrt) +target_link_libraries(infrtopt infrt) add_executable(print-ir print_ir.cc) target_link_libraries(print-ir infrt ${mlir_libs}) diff --git a/paddle/infrt/dialect/basic_kernels.cc b/paddle/infrt/dialect/basic_kernels.cc index b4d2b9182b0c5..bad7e73ec5ae5 100644 --- a/paddle/infrt/dialect/basic_kernels.cc +++ b/paddle/infrt/dialect/basic_kernels.cc @@ -17,17 +17,17 @@ #include #include #include -#include -#include +#include +#include #include #include -#include #include #include #include "paddle/infrt/dialect/dense_tensor.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { using namespace mlir; // NOLINT static ParseResult parseCallOp(OpAsmParser &parser, // NOLINT @@ -71,12 +71,12 @@ static ParseResult parseConstantF64Op(OpAsmParser &parser, // NOLINT static ParseResult parseConstantI32Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(32, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 32), parser, result); } static ParseResult parseConstantI64Op(OpAsmParser &parser, // NOLINT OperationState &result) { // NOLINT return parseConstantOp( - IntegerType::get(64, result.getContext()), parser, result); + IntegerType::get(result.getContext(), 64), parser, result); } static ParseResult parseReturnOp(OpAsmParser &parser, // NOLINT @@ -90,10 +90,10 @@ static ParseResult 
parseReturnOp(OpAsmParser &parser, // NOLINT } static void print(OpAsmPrinter &p, CallOp op) { // NOLINT - p << "infrt.call " << op.getAttr("callee") << "("; + p << "infrt.call " << op->getAttr("callee") << "("; p.printOperands(op.getOperands()); p << ")"; - p.printOptionalAttrDict(op.getAttrs(), {"callee"}); + p.printOptionalAttrDict(op->getAttrs(), {"callee"}); p << " : "; } @@ -145,7 +145,7 @@ static LogicalResult verify(ConstantF64Op op) { return success(); } static LogicalResult verify(ConstantI64Op op) { return success(); } static LogicalResult verify(ReturnOp op) { - auto function = dyn_cast(op.getParentOp()); + auto function = dyn_cast(op->getParentOp()); if (!function) return success(); @@ -157,8 +157,8 @@ static LogicalResult verify(ReturnOp op) { return success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.h b/paddle/infrt/dialect/basic_kernels.h index 65316bc1437c0..b82abcd52d28f 100644 --- a/paddle/infrt/dialect/basic_kernels.h +++ b/paddle/infrt/dialect/basic_kernels.h @@ -13,12 +13,9 @@ // limitations under the License. #pragma once +#include #include #include -using namespace mlir; // NOLINT - -namespace infrt::dialect { #define GET_OP_CLASSES #include "paddle/infrt/dialect/basic_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/basic_kernels.td b/paddle/infrt/dialect/basic_kernels.td index df5e4d8a2c6a1..7d8de79fbae2b 100644 --- a/paddle/infrt/dialect/basic_kernels.td +++ b/paddle/infrt/dialect/basic_kernels.td @@ -27,7 +27,7 @@ def CallOp : INFRT_Op<"call"> { let results = (outs Variadic); let extraClassDeclaration = [{ - StringRef getCallee() { return callee(); } + mlir::StringRef getCallee() { return callee(); } mlir::FunctionType getCalleeType(); }]; } @@ -57,9 +57,8 @@ def ReturnOp : INFRT_Op<"return", [Terminator]> { let arguments = (ins Variadic:$operands); - let builders = [OpBuilder< - "OpBuilder &b, OperationState &result", - [{ build(b, result, llvm::None); }]>]; + let builders = [OpBuilder<(ins), + [{ build($_builder, $_state, llvm::None); }]>]; } class AddOp : INFRT_Op<"add." 
# suffix, [NoSideEffect]> { diff --git a/paddle/infrt/dialect/dense_tensor.cc b/paddle/infrt/dialect/dense_tensor.cc index 629a7b16523fc..7685cdc65b9ad 100644 --- a/paddle/infrt/dialect/dense_tensor.cc +++ b/paddle/infrt/dialect/dense_tensor.cc @@ -17,12 +17,11 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include @@ -31,68 +30,37 @@ #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/tensor_shape.h" -namespace infrt::dt { - +namespace infrt { +namespace dt { void DTDialect::initialize() { - allowUnknownTypes(); addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/dense_tensor.cpp.inc" >(); } -namespace detail { -struct TensorTypeStorage : public mlir::TypeStorage { - TensorTypeStorage(TargetType target, - LayoutType layout, - PrecisionType precision) - : target_(target), layout_(layout), precision_(precision) {} - - using KeyTy = std::tuple; - - bool operator==(const KeyTy &key) const { - return key == KeyTy(target_, layout_, precision_); - } - - static llvm::hash_code hashKey(const KeyTy &key) { - return llvm::hash_value(key); - } - - static TensorTypeStorage *construct( - mlir::TypeStorageAllocator &allocator, // NOLINT - const KeyTy &key) { - return new (allocator.allocate()) - TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); - } - - TargetType target_; - LayoutType layout_; - PrecisionType precision_; -}; -} // namespace detail - llvm::Optional GetTargetType(mlir::StringRef key) { - if (key.equals_lower("x86")) + if (key.equals_insensitive("x86")) return TargetType::X86; - else if (key.equals_lower("cuda")) + else if (key.equals_insensitive("cuda")) return TargetType::CUDA; else return llvm::None; } llvm::Optional GetLayoutType(mlir::StringRef key) { - if (key.equals_lower("nchw")) + if (key.equals_insensitive("nchw")) return LayoutType::NCHW; - else if (key.equals_lower("nhwc")) + else if (key.equals_insensitive("nhwc")) return LayoutType::NHWC; else return llvm::None; } llvm::Optional GetPrecisionType(mlir::StringRef key) { - if (key.equals_lower("i32")) + if (key.equals_insensitive("i32")) return PrecisionType::I32; - else if (key.equals_lower("f32")) + else if (key.equals_insensitive("f32")) return PrecisionType::F32; else return llvm::None; @@ -111,7 +79,7 @@ LayoutType TensorType::layout() { return getImpl()->layout_; } PrecisionType TensorType::precision() { return getImpl()->precision_; } -raw_ostream &operator<<(raw_ostream &os, TensorType tensorType) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TensorType tensorType) { os << "TensorType<" << tensorType.target() << ", " << tensorType.layout() << ", " << tensorType.precision() << ">"; return os; @@ -133,7 +101,7 @@ StringType StringType::get(mlir::MLIRContext *context) { return Base::get(context); } -raw_ostream &operator<<(raw_ostream &os, TargetType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type) { switch (type) { case (TargetType::X86): os << "X86"; @@ -147,7 +115,7 @@ raw_ostream &operator<<(raw_ostream &os, TargetType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, LayoutType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type) { switch (type) { case (LayoutType::NCHW): os << "NCHW"; @@ -161,7 +129,7 @@ raw_ostream &operator<<(raw_ostream &os, LayoutType type) { return os; } -raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type) { 
switch (type) { case (PrecisionType::I32): os << "I32"; @@ -175,103 +143,69 @@ raw_ostream &operator<<(raw_ostream &os, PrecisionType type) { return os; } -static Type getTensorType(mlir::MLIRContext *context) { - auto t_dialect = Identifier::get("t", context); - return OpaqueType::get(t_dialect, "tensor", context); +static mlir::Type getTensorType(mlir::MLIRContext *context) { + auto t_dialect = mlir::Identifier::get("t", context); + return mlir::OpaqueType::get(t_dialect, "tensor"); } -static ParseResult parseCreateUninitTensorOp( - OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT +static mlir::ParseResult parseCreateUninitTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT auto loc = parser.getCurrentLocation(); - ::mlir::Type outputRawTypes[1]; - ::llvm::ArrayRef<::mlir::Type> outputTypes(outputRawTypes); + mlir::Type outputRawTypes[1]; + ::llvm::ArrayRef outputTypes(outputRawTypes); mlir::ArrayAttr shapeAttr; if (parser.parseAttribute(shapeAttr, parser.getBuilder().getI64Type(), "shape", result.attributes)) - return failure(); - if (parser.parseOptionalAttrDict(result.attributes)) return failure(); + return mlir::failure(); + if (parser.parseOptionalAttrDict(result.attributes)) return mlir::failure(); - if (parser.parseArrow()) return failure(); - if (parser.parseType(outputRawTypes[0])) return failure(); + if (parser.parseArrow()) return mlir::failure(); + if (parser.parseType(outputRawTypes[0])) return mlir::failure(); if (!outputRawTypes[0].isa()) return parser.emitError(loc, "invalid kind of type specified"); result.addTypes(outputTypes); - return success(); + return mlir::success(); } template -static void printCreateUninitTensorOp(OpAsmPrinter &p, // NOLINT +static void printCreateUninitTensorOp(mlir::OpAsmPrinter &p, // NOLINT CreateUninitTensorOp op) { p << CreateUninitTensorOp::getOperationName(); p << " "; p.printAttributeWithoutType(op.shapeAttr()); - p.printOptionalAttrDict(op.getAttrs(), /*elidedAttrs=*/{"shape"}); + p.printOptionalAttrDict(op->getAttrs(), /*elidedAttrs=*/{"shape"}); p << " -> "; p << op.getOperation()->getResultTypes(); } -// TODO(shibo): can be removed? -// static ParseResult parseFillTensorWithConstantOp(OpAsmParser& parser, -// OperationState& result) { -// auto loc = parser.getCurrentLocation(); -// ::mlir::OpAsmParser::OperandType inputRawOperands[1]; -// ::llvm::ArrayRef<::mlir::OpAsmParser::OperandType> -// inputOperands(inputRawOperands); -// ::mlir::Type inputRawTypes[1]; -// ::llvm::ArrayRef<::mlir::Type> inputTypes(inputRawTypes); -// -// if (parser.parseOperand(inputRawOperands[0])) return failure(); -// -// if (parser.parseColon()) return failure(); -// if (parser.parseType(inputRawTypes[0])) return failure(); -// if (!inputRawTypes[0].isa()) -// return parser.emitError(loc, "invalid kind of type specified"); -// -// Attribute value_attr; -// if (parser.resolveOperands(inputOperands, inputTypes, loc, result.operands)) -// return failure(); -// if (parser.parseAttribute(value_attr, "value", result.attributes)) return -// failure(); -// return success(); -//} - -// TODO(shibo): can be removed? 
-// template -// static void printFillTensorWithConstantOp(OpAsmPrinter& p, FillTensorOp op) { -// p << FillTensorOp::getOperationName(); -// p << " "; -// p.printOperand(op.getOperand()); -// p << " : "; -// p << op.getOperation()->getOperandTypes(); -// p << " "; -// p << op.getAttr("value"); -//} - -static ParseResult parseSetTensorOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - SmallVector operands; - if (parser.parseOperandList(operands, 1)) return failure(); +static mlir::ParseResult parseSetTensorOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + llvm::SmallVector operands; + if (parser.parseOperandList(operands, 1)) return mlir::failure(); auto tensor_type = getTensorType(result.getContext()); - Attribute value_attr; - return failure( + mlir::Attribute value_attr; + return mlir::failure( parser.resolveOperand(operands[0], tensor_type, result.operands) || parser.parseAttribute(value_attr, "values", result.attributes)); } template -static void printSetTensorOp(OpAsmPrinter &p, SetTensorOp op) { // NOLINT +static void printSetTensorOp(mlir::OpAsmPrinter &p, SetTensorOp op) { // NOLINT p << SetTensorOp::getOperationName() << " "; p.printOperand(op.getOperand()); - p << " " << op.getAttr("values"); + p << " " << op->getAttr("values"); } +} // namespace dt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/dense_tensor.cpp.inc" // NOLINT -} // namespace infrt::dt +#include "paddle/infrt/dialect/dense_tensor_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/dense_tensor.h b/paddle/infrt/dialect/dense_tensor.h index 866c62213ab05..416925d3382ba 100644 --- a/paddle/infrt/dialect/dense_tensor.h +++ b/paddle/infrt/dialect/dense_tensor.h @@ -19,13 +19,8 @@ #include -using namespace mlir; // NOLINT -namespace infrt::dt { - -namespace detail { -struct TensorTypeStorage; -} // namespace detail - +namespace infrt { +namespace dt { enum class TargetType : uint8_t { X86, CUDA }; enum class LayoutType : uint8_t { NCHW, NHWC }; enum class PrecisionType : uint8_t { I32, F32 }; @@ -34,9 +29,39 @@ llvm::Optional GetTargetType(mlir::StringRef key); llvm::Optional GetLayoutType(mlir::StringRef key); llvm::Optional GetPrecisionType(mlir::StringRef key); -raw_ostream &operator<<(raw_ostream &os, TargetType type); -raw_ostream &operator<<(raw_ostream &os, LayoutType type); -raw_ostream &operator<<(raw_ostream &os, PrecisionType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, TargetType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, LayoutType type); +mlir::raw_ostream &operator<<(mlir::raw_ostream &os, PrecisionType type); + +namespace detail { +struct TensorTypeStorage : public mlir::TypeStorage { + TensorTypeStorage(TargetType target, + LayoutType layout, + PrecisionType precision) + : target_(target), layout_(layout), precision_(precision) {} + + using KeyTy = std::tuple; + + bool operator==(const KeyTy &key) const { + return key == KeyTy(target_, layout_, precision_); + } + + static llvm::hash_code hashKey(const KeyTy &key) { + return llvm::hash_value(key); + } + + static TensorTypeStorage *construct( + mlir::TypeStorageAllocator &allocator, // NOLINT + const KeyTy &key) { + return new (allocator.allocate()) + TensorTypeStorage(std::get<0>(key), std::get<1>(key), std::get<2>(key)); + } + + TargetType target_; + LayoutType layout_; + PrecisionType precision_; +}; +} // namespace detail class TensorType : public mlir::Type::TypeBase #include -namespace infrt::dialect { +namespace infrt { 
+namespace dialect { struct MyScopedDiagnosicHandler::Impl { Impl() : diag_stream_(diag_str_) {} @@ -49,4 +51,5 @@ mlir::LogicalResult MyScopedDiagnosicHandler::handler(mlir::Diagnostic *diag) { return mlir::failure(true); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/diagnostic_utils.h b/paddle/infrt/dialect/diagnostic_utils.h index 3a8098cf75181..746e61c8fe5c3 100644 --- a/paddle/infrt/dialect/diagnostic_utils.h +++ b/paddle/infrt/dialect/diagnostic_utils.h @@ -18,7 +18,8 @@ #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { /** * A scoped diagnostic handler to help debug MLIR process. @@ -36,4 +37,5 @@ class MyScopedDiagnosicHandler : public mlir::SourceMgrDiagnosticHandler { std::unique_ptr impl_; }; -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/dialect.cc b/paddle/infrt/dialect/dialect.cc index cbcd5d0f0fa78..fe07b91d22ed5 100644 --- a/paddle/infrt/dialect/dialect.cc +++ b/paddle/infrt/dialect/dialect.cc @@ -13,24 +13,26 @@ // limitations under the License. #include +#include #include -#include #include #include -#include #include #include -namespace infrt::hlir::dialect { +namespace infrt { +namespace hlir { +namespace dialect { -class CinnDialect : public ::mlir::Dialect { +class CinnDialect : public mlir::Dialect { public: - explicit CinnDialect(::mlir::MLIRContext* ctx); + explicit CinnDialect(mlir::MLIRContext* ctx); //! We should register this function in dialect static llvm::StringRef getDialectNamespace() { return "infrt::hlir::dialect"; } }; - -} // namespace infrt::hlir::dialect +} // namespace dialect +} // namespace hlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.cc b/paddle/infrt/dialect/infrt_base.cc index b28ad5ad4b5a5..e8005661bbd65 100644 --- a/paddle/infrt/dialect/infrt_base.cc +++ b/paddle/infrt/dialect/infrt_base.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/test_kernels.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { // ----INFRTDialect definition begin---- void INFRTDialect::initialize() { @@ -124,4 +125,5 @@ void INFRTDialect::printType(mlir::Type type, // ----INFRTDialect definition end---- -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index 58acd7c9a409a..1a7fbcf395a6e 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -18,19 +18,17 @@ #include #include #include -#include #include #include #include "paddle/infrt/dialect/infrt_base.hpp.inc" -namespace infrt::dialect { - -class INFRTDialect : public ::mlir::Dialect { - explicit INFRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect(getDialectNamespace(), - context, - ::mlir::TypeID::get()) { +namespace infrt { +namespace dialect { +class INFRTDialect : public mlir::Dialect { + explicit INFRTDialect(mlir::MLIRContext *context) + : mlir::Dialect( + getDialectNamespace(), context, mlir::TypeID::get()) { initialize(); } @@ -41,15 +39,12 @@ class INFRTDialect : public ::mlir::Dialect { mlir::DialectAsmPrinter &printer) const override; void initialize(); - friend class ::mlir::MLIRContext; + friend class mlir::MLIRContext; public: static ::llvm::StringRef getDialectNamespace() { return "infrt"; } }; - -} // namespace infrt::dialect - -namespace mlir { +} // namespace dialect template static 
mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT @@ -58,17 +53,16 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } -static mlir::SmallVector<::mlir::Value, 4> cvtValueToValueRange( +static mlir::SmallVector cvtValueToValueRange( const mlir::Value &operand) { - return mlir::SmallVector<::mlir::Value, 4>(1, operand); + return mlir::SmallVector(1, operand); } -static mlir::SmallVector<::mlir::Value, 4> concatTwoValueRange( +static mlir::SmallVector concatTwoValueRange( mlir::ValueRange operand_0, mlir::ValueRange operand_1) { - mlir::SmallVector<::mlir::Value, 4> operands; + mlir::SmallVector operands; operands.append(operand_0.begin(), operand_0.end()); operands.append(operand_1.begin(), operand_1.end()); return operands; } - -} // namespace mlir +} // namespace infrt diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 7d6fdbbbf2f68..1abd294236d93 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -28,11 +28,11 @@ def TensorMapType : def BufferType : OpaqueType<"b", "buffer", "buffer">; class INFRT_createI32Attr : NativeCodeCall< - "mlir::createI32Attr($_builder, $_loc, " # value # ")">; + "infrt::createI32Attr($_builder, $_loc, " # value # ")">; def INFRT_cvtValueToValueRange : NativeCodeCall< - "mlir::cvtValueToValueRange($0)">; + "infrt::cvtValueToValueRange($0)">; def INFRT_concatTwoValueRange : NativeCodeCall< - "mlir::concatTwoValueRange($0, $1)">; + "infrt::concatTwoValueRange($0, $1)">; #endif // INFRT_BASE diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index 4bc2bf70942d2..c3769414dbb39 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,12 +23,10 @@ #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry) { // NOLINT - registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); +void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT + registry.insert(); } - } // namespace infrt diff --git a/paddle/infrt/dialect/init_infrt_dialects.h b/paddle/infrt/dialect/init_infrt_dialects.h index 50caca018980d..0912e9ef2555b 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.h +++ b/paddle/infrt/dialect/init_infrt_dialects.h @@ -14,10 +14,8 @@ #pragma once -#include "mlir/IR/Dialect.h" - +#include +#include namespace infrt { - -void RegisterCinnDialects(mlir::DialectRegistry& registry); // NOLINT - +void registerCinnDialects(mlir::DialectRegistry ®istry); // NOLINT } // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.cc b/paddle/infrt/dialect/mlir_loader.cc index b318a6a763483..1d0696e77dcda 100644 --- a/paddle/infrt/dialect/mlir_loader.cc +++ b/paddle/infrt/dialect/mlir_loader.cc @@ -16,8 +16,8 @@ #include #include +#include #include -#include #include #include #include @@ -30,12 +30,15 @@ #include "paddle/infrt/dialect/diagnostic_utils.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); // Currenetly, We only used the 
CinnDialect and mlir::BuiltinDialect is // enough。Don't need StandardOpsDialect. // context->getDialectRegistry().insert(); @@ -57,9 +60,9 @@ mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context) { // context->allowUnregisteredDialects(); - RegisterCinnDialects(context->getDialectRegistry()); - context->getDialectRegistry().insert(); - + mlir::DialectRegistry registry; + registerCinnDialects(registry); + context->appendDialectRegistry(registry); mlir::ScopedDiagnosticHandler scope_handler( context, [](mlir::Diagnostic& diag) { if (diag.getSeverity() != mlir::DiagnosticSeverity::Error) @@ -71,4 +74,5 @@ mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, return mlir::parseSourceFile(std::string(file_name), context); } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader.h b/paddle/infrt/dialect/mlir_loader.h index 092da7d9ce03f..5e50ad9e5a271 100644 --- a/paddle/infrt/dialect/mlir_loader.h +++ b/paddle/infrt/dialect/mlir_loader.h @@ -15,16 +15,17 @@ #pragma once #include -#include +#include #include #include -namespace infrt::dialect { +namespace infrt { +namespace dialect { mlir::OwningModuleRef LoadMlirSource(mlir::MLIRContext* context, const std::string& mlir_source); mlir::OwningModuleRef LoadMlirFile(const std::string& file_name, mlir::MLIRContext* context); - -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_loader_test.cc b/paddle/infrt/dialect/mlir_loader_test.cc index 1b622d585ad8e..1115053073044 100644 --- a/paddle/infrt/dialect/mlir_loader_test.cc +++ b/paddle/infrt/dialect/mlir_loader_test.cc @@ -17,14 +17,15 @@ #include #include #include -#include +#include #include #include #include "paddle/infrt/dialect/init_infrt_dialects.h" -namespace infrt::dialect { +namespace infrt { +namespace dialect { TEST(MlirLoader, basic) { mlir::MLIRContext context; @@ -42,8 +43,7 @@ func @main() -> f32 { )ROC"; auto module = LoadMlirSource(&context, source); - module->verify(); - + EXPECT_TRUE(mlir::succeeded(module->verify())); LOG(INFO) << "module name: " << module->getOperationName().data(); for (auto func : module->getOps()) { LOG(INFO) << "get func " << func.getName().str(); @@ -54,4 +54,5 @@ func @main() -> f32 { } } -} // namespace infrt::dialect +} // namespace dialect +} // namespace infrt diff --git a/paddle/infrt/dialect/mlir_tests/rewrite.mlir b/paddle/infrt/dialect/mlir_tests/rewrite.mlir index bfad9d1f6924d..5e207634da8e4 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite.mlir @@ -20,5 +20,5 @@ func @main() -> tensor { %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - infrt.return %e2 : tensor + "pd.fetch"(%e2) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir index 9ea1ec0ebca36..2889b92b18ef0 100644 --- a/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir +++ b/paddle/infrt/dialect/mlir_tests/rewrite_conv_bn.mlir @@ -11,5 +11,5 @@ func @main() -> tensor { %c = "pd.conv2d"(%a, %filter, %bias) {} : (tensor, tensor<3x64x3x3xf32>, tensor<64xf32>) -> tensor %d = "pd.batch_norm"(%c, 
%scale, %bias2, %mean, %var) {} : (tensor, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> tensor - infrt.return %d : tensor + "pd.fetch"(%d) {name="output"} :(tensor)->() } \ No newline at end of file diff --git a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir index 009b6d1c19653..d98f107bab41e 100644 --- a/paddle/infrt/dialect/mlir_tests/trt_ops.mlir +++ b/paddle/infrt/dialect/mlir_tests/trt_ops.mlir @@ -18,5 +18,5 @@ func @main() -> tensor { %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:i32} : (tensor, tensor) -> tensor %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor - "pd.fetch"(%e2) :(tensor)->() + "pd.fetch"(%e2) {name="output"} :(tensor)->() } diff --git a/paddle/infrt/dialect/ops.td b/paddle/infrt/dialect/ops.td deleted file mode 100644 index 264134a447c63..0000000000000 --- a/paddle/infrt/dialect/ops.td +++ /dev/null @@ -1,6 +0,0 @@ -include "mlir/IR/OpBase.td" -include "paddle/infrt/dialect/infrt_base.td" - - -class INFRT_Op traits = []> : - Op; diff --git a/paddle/infrt/dialect/opt.cc b/paddle/infrt/dialect/opt.cc index d90d25230d0c2..5bcf5a23f4c53 100644 --- a/paddle/infrt/dialect/opt.cc +++ b/paddle/infrt/dialect/opt.cc @@ -12,34 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include - -#include - -#include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" -#include "paddle/infrt/dialect/mlir_loader.h" int main(int argc, char **argv) { - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); mlir::registerCanonicalizerPass(); - return mlir::failed( - mlir::MlirOptMain(argc, argv, "INFRT mlir pass driver", registry)); + mlir::MlirOptMain(argc, argv, "infrt mlir pass driver", registry)); } diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index af53df113dfb3..a3e3c4ae59277 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -16,7 +16,7 @@ def PD_Dialect : Dialect { This dialect contains the PaddlePaddle operators. 
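// Sketch of the registration flow the rewritten opt.cc above relies on:
// dialects go into a local mlir::DialectRegistry that is handed to the tool
// driver, instead of being registered on a global MLIRContext. Header paths
// match MLIR releases of this vintage and are an assumption here.
#include <mlir/IR/Dialect.h>
#include <mlir/Support/LogicalResult.h>
#include <mlir/Support/MlirOptMain.h>
#include <mlir/Transforms/Passes.h>
#include "paddle/infrt/dialect/init_infrt_dialects.h"

int main(int argc, char **argv) {
  mlir::DialectRegistry registry;
  infrt::registerCinnDialects(registry);  // registry.insert<...>() inside
  mlir::registerCanonicalizerPass();
  // MlirOptMain returns an mlir::LogicalResult; main still wraps it in
  // mlir::failed() to turn it into a process exit code.
  return mlir::failed(
      mlir::MlirOptMain(argc, argv, "infrt mlir pass driver", registry));
}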
}]; - let cppNamespace = "::mlir::pd"; + let cppNamespace = "mlir::pd"; } class PD_Op traits = []> : diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index ce10be6d100f8..fe38996883846 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -14,10 +14,15 @@ #include "paddle/infrt/dialect/pd_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" +#include +#include #include "paddle/infrt/dialect/infrt_base.h" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT + namespace mlir { namespace pd { PaddleDialect::PaddleDialect(MLIRContext *context) @@ -36,12 +41,6 @@ mlir::Operation *PaddleDialect::materializeConstant(mlir::OpBuilder &builder, return builder.create(loc, value); } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - -#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT - void ConstantOp::build(OpBuilder &builder, OperationState &state, Attribute value) { @@ -66,8 +65,8 @@ LogicalResult ConstantOp::inferReturnTypes( inferredReturnTypes.push_back(attributes.get("value").getType()); return success(); } -::mlir::OpFoldResult ConstantOp::fold( - ::llvm::ArrayRef<::mlir::Attribute> operands) { +mlir::OpFoldResult ConstantOp::fold( + ::llvm::ArrayRef operands) { return value(); } @@ -82,11 +81,11 @@ LogicalResult ElementwiseAdd::inferReturnTypes( return success(); } void ElementwiseAdd::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } -::mlir::OpFoldResult ElementwiseAdd::fold( +mlir::OpFoldResult ElementwiseAdd::fold( llvm::ArrayRef operands) { if (getElementTypeOrSelf(getType()).isa()) { if (!operands[0] || !operands[1]) return {}; @@ -154,17 +153,17 @@ LogicalResult MulOp::inferReturnTypes( } void ReluOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void FusedRepeatedFCRelu::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } void BatchNormOp::getCanonicalizationPatterns( - ::mlir::OwningRewritePatternList &results, ::mlir::MLIRContext *context) { + mlir::OwningRewritePatternList &results, mlir::MLIRContext *context) { results.insert(context); } diff --git a/paddle/infrt/dialect/pd_ops.h b/paddle/infrt/dialect/pd_ops.h index 71e0a53988d1a..7d1d1d6f58451 100644 --- a/paddle/infrt/dialect/pd_ops.h +++ b/paddle/infrt/dialect/pd_ops.h @@ -14,21 +14,20 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include namespace mlir { namespace pd { @@ -53,9 +52,8 @@ class PaddleDialect : public Dialect { } }; -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/pd_ops.hpp.inc" -#undef GET_OP_CLASSES - } // namespace pd } // namespace mlir + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/pd_ops.hpp.inc" diff --git a/paddle/infrt/dialect/pd_ops.td b/paddle/infrt/dialect/pd_ops.td index b020b7ad5dbc7..3addf15082a12 100644 --- a/paddle/infrt/dialect/pd_ops.td +++ b/paddle/infrt/dialect/pd_ops.td @@ -24,6 +24,16 @@ def PD_FeedOp : PD_Op<"feed"> { def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let summary = "fetch Op"; + let description = [{ + Return the output tensor from the subgraph. + }]; + + let arguments = (ins PD_Tensor :$inputs, StrAttr:$name); +} + +def PD_ReturnOp : PD_Op<"return", [Terminator]> { + let summary = "return Op"; + let description = [{ Fetch tensor from the graph. }]; @@ -31,7 +41,7 @@ def PD_FetchOp : PD_Op<"fetch", [Terminator]> { let arguments = (ins Variadic:$inputs); } -def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> { +def PD_GraphOp : PD_Op<"graph", [SingleBlockImplicitTerminator<"ReturnOp">]> { let summary = "paddle graph Op"; let description = [{ Describe a paddle graph or subgraph. @@ -50,7 +60,7 @@ def PD_ConstantOp : PD_Op<"constant", [NoSideEffect, ConstantLike, DeclareOpInte let hasFolder = 1; let builders = [ - OpBuilder<"OpBuilder &builder, OperationState &state, Attribute value">, + OpBuilder<(ins "Attribute":$value)>, ]; } diff --git a/paddle/infrt/dialect/pd_types.h b/paddle/infrt/dialect/pd_types.h index 6f9fe56338a9f..0da888a9c0769 100644 --- a/paddle/infrt/dialect/pd_types.h +++ b/paddle/infrt/dialect/pd_types.h @@ -18,12 +18,11 @@ #pragma once -#include "mlir/IR/Diagnostics.h" -#include "mlir/IR/Location.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/IR/Types.h" +#include +#include +#include +#include +#include namespace mlir { namespace PD { diff --git a/paddle/infrt/dialect/print_ir.cc b/paddle/infrt/dialect/print_ir.cc index 43a3577b90f10..5cfd16ee85943 100644 --- a/paddle/infrt/dialect/print_ir.cc +++ b/paddle/infrt/dialect/print_ir.cc @@ -11,26 +11,25 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
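// Sketch of how the getCanonicalizationPatterns hooks defined in pd_ops.cc
// above are exercised: the DRR rewrites from rewrite.hpp.inc are collected
// into a pattern set and run by the greedy driver
// (mlir/Transforms/GreedyPatternRewriteDriver.h). Type and pass names follow
// MLIR of this era (OwningRewritePatternList, mlir::FuncOp) and are
// assumptions, not code from this patch.
static void applyPdCanonicalizations(mlir::FuncOp func) {
  mlir::MLIRContext *ctx = func.getContext();
  mlir::OwningRewritePatternList patterns(ctx);
  mlir::pd::ElementwiseAdd::getCanonicalizationPatterns(patterns, ctx);
  mlir::pd::ReluOp::getCanonicalizationPatterns(patterns, ctx);
  mlir::pd::BatchNormOp::getCanonicalizationPatterns(patterns, ctx);
  // Applies the patterns repeatedly, folding as it goes, until a fixed point.
  (void)mlir::applyPatternsAndFoldGreedily(func, std::move(patterns));
}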
- +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include "llvm/ADT/Optional.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/ScopedPrinter.h" -#include "llvm/Support/raw_os_ostream.h" -#include "llvm/Support/raw_ostream.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/AsmState.h" -#include "mlir/IR/Block.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/Operation.h" -#include "mlir/IR/Region.h" -#include "mlir/IR/Verifier.h" -#include "mlir/Parser.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/Passes.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/init_infrt_dialects.h" @@ -114,17 +113,15 @@ int main(int argc, char **argv) { mlir::registerPassManagerCLOptions(); cl::ParseCommandLineOptions(argc, argv, "mlir demo"); - mlir::MLIRContext *context = infrt::Global::getMLIRContext(); - // context->allowUnregisteredDialects(); - auto ®istry = context->getDialectRegistry(); - infrt::RegisterCinnDialects(registry); - + mlir::DialectRegistry registry; + infrt::registerCinnDialects(registry); + mlir::MLIRContext context(registry); // mlir will verify module automatically after parsing. // https://github.com/llvm/llvm-project/blob/38d18d93534d290d045bbbfa86337e70f1139dc2/mlir/lib/Parser/Parser.cpp#L2051 // mlir::OwningModuleRef module_ref = mlir::parseSourceString(mlir_source, // context); mlir::OwningModuleRef module_ref = - mlir::parseSourceFile(inputFilename, context); + mlir::parseSourceFile(inputFilename, &context); std::cout << "----------print IR Structure begin----------" << std::endl; printOperation(module_ref->getOperation(), 0); std::cout << "----------print IR Structure end----------" << std::endl; diff --git a/paddle/infrt/dialect/tensor_shape.cc b/paddle/infrt/dialect/tensor_shape.cc index ef5a5525cb22f..92c03818264ee 100644 --- a/paddle/infrt/dialect/tensor_shape.cc +++ b/paddle/infrt/dialect/tensor_shape.cc @@ -17,16 +17,16 @@ #include #include #include +#include +#include #include -#include -#include #include #include -#include #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { using namespace mlir; // NOLINT void TensorShapeDialect::initialize() { @@ -48,8 +48,8 @@ Type TensorShapeDialect::parseType(DialectAsmParser &parser) const { return Type(); } -void TensorShapeDialect::printType(::mlir::Type type, - ::mlir::DialectAsmPrinter &os) const { +void TensorShapeDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &os) const { if (type.isa()) { os << "shape"; return; @@ -61,8 +61,10 @@ void TensorShapeDialect::printType(::mlir::Type type, } llvm_unreachable("unexpected 'shape' type kind"); } +} // namespace ts +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensor_shape.cpp.inc" // NOLINT -} // namespace infrt::ts +#include "paddle/infrt/dialect/tensor_shape_dialect.cpp.inc" diff --git a/paddle/infrt/dialect/tensor_shape.h b/paddle/infrt/dialect/tensor_shape.h index bd3fa8853675a..af892af735d2a 100644 --- a/paddle/infrt/dialect/tensor_shape.h +++ b/paddle/infrt/dialect/tensor_shape.h @@ -17,7 +17,8 @@ #include #include -namespace infrt::ts { +namespace infrt { +namespace ts { class ShapeType : public mlir::Type::TypeBase { @@ -31,10 +32,9 @@ class PartialShapeType : public mlir::Type::TypeBase()">, "!ts.shape type">, 
BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.shape type` represents a static tensor shape. }]; } @@ -27,7 +27,7 @@ BuildableType<"$_builder.getType<::infrt::ts::ShapeType>()"> { def TS_PartialShape : DialectType()">, "!ts.partial_shape type">, BuildableType<"$_builder.getType<::infrt::ts::PartialShapeType>()"> { - let typeDescription = [{ + let description = [{ `!ts.partial_shape type` represents either a static tensor shape, unranked tensor shape or a ranked tensor shape with unknown dimension sizes. }]; diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc index dc0f2acb2b733..1baef7a3f77fd 100644 --- a/paddle/infrt/dialect/tensorrt/trt_exec.cc +++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc @@ -11,10 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include #include -#include "llvm/Support/CommandLine.h" -#include "mlir/Pass/PassManager.h" #include "paddle/infrt/common/global.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc index 181f462962aee..1da80ef2c3b10 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h" +#include +#include +#include +#include #include #include #include -#include "llvm/ADT/SetVector.h" -#include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/IR/Builders.h" -#include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { @@ -32,9 +31,9 @@ namespace { // Reference the function nameed "FlexibleDFS" but defined in: // paddle/fluid/framework/ir/subgraph_detector.cc. -bool reverseDfs(std::vector<::mlir::Operation *> source, - const std::function &func) { - std::unordered_set visited; +bool reverseDfs(std::vector source, + const std::function &func) { + std::unordered_set visited; while (!source.empty()) { auto node = source.back(); source.pop_back(); @@ -44,7 +43,7 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, auto values = node->getOperands(); for (auto value : values) { // if the value is a block argument, the node is nullptr. - ::mlir::Operation *node = value.getDefiningOp(); + mlir::Operation *node = value.getDefiningOp(); if (node != nullptr && !visited.count(node)) { source.emplace_back(node); } @@ -54,19 +53,19 @@ bool reverseDfs(std::vector<::mlir::Operation *> source, } // merge the first&second graph op to a new graph op. 
-void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT - ::mlir::pd::GraphOp first, - ::mlir::pd::GraphOp second) { +void mergeTwoAdjacentGraphOp(mlir::OpBuilder &builder, // NOLINT + mlir::pd::GraphOp first, + mlir::pd::GraphOp second) { // comput inputs and outputs - ::llvm::SmallVector<::mlir::Value, 4> inputs(first.getOperands()), outputs; - for (::mlir::Value input : second.getOperands()) { + ::llvm::SmallVector inputs(first.getOperands()), outputs; + for (mlir::Value input : second.getOperands()) { if (input.getDefiningOp() != first) { inputs.push_back(input); } } - ::llvm::DenseMap<::mlir::Value, unsigned int> op_output_mapping; - for (::mlir::Value output : first.getResults()) { - for (::mlir::Operation *user : output.getUsers()) { + ::llvm::DenseMap op_output_mapping; + for (mlir::Value output : first.getResults()) { + for (mlir::Operation *user : output.getUsers()) { if (user != second && user->getParentOp() != second) { op_output_mapping[output] = outputs.size(); outputs.push_back(output); @@ -74,19 +73,19 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT } } } - auto fetch_op = second.getBody()->getTerminator(); - outputs.append(fetch_op->getOperands().begin(), - fetch_op->getOperands().end()); - ::llvm::SmallVector<::mlir::Type, 4> fetch_types; + auto return_op = second.getBody()->getTerminator(); + outputs.append(return_op->getOperands().begin(), + return_op->getOperands().end()); + ::llvm::SmallVector return_types; for (auto value : outputs) { - fetch_types.push_back(value.getType()); + return_types.push_back(value.getType()); } // create the new graph op builder.setInsertionPoint(first); auto loc = first.getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>(loc, fetch_types, inputs); - ::mlir::Block *block = new ::mlir::Block; + auto graph_op = builder.create(loc, return_types, inputs); + mlir::Block *block = new mlir::Block; auto copy_range = second.getBody()->without_terminator(); block->getOperations().splice(block->begin(), second.getBody()->getOperations(), @@ -98,18 +97,18 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT copy_range.begin(), copy_range.end()); builder.setInsertionPointToEnd(block); - builder.create(loc, outputs); + builder.create(loc, outputs); graph_op.body().push_back(block); // mapping the output unsigned int num_result = first.getNumResults(); - fetch_op = first.getBody()->getTerminator(); + return_op = first.getBody()->getTerminator(); for (unsigned int index = 0; index < num_result; ++index) { auto origin_value = first.getResult(index); if (op_output_mapping.find(origin_value) == op_output_mapping.end()) { - origin_value.replaceAllUsesWith(fetch_op->getOperand(index)); + origin_value.replaceAllUsesWith(return_op->getOperand(index)); } else { - auto inner_value = fetch_op->getOperand(index); + auto inner_value = return_op->getOperand(index); auto outer_value = graph_op.getResult(op_output_mapping[origin_value]); while (!origin_value.use_empty()) { auto replace_value = @@ -128,13 +127,13 @@ void mergeTwoAdjacentGraphOp(::mlir::OpBuilder &builder, // NOLINT // Topological sort the function op. 
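// Block::getOperations().splice moves ops between blocks without cloning,
// which is what keeps the merge above cheap: both graph bodies are spliced
// (minus their pd.return terminators) into one fresh block, and a single new
// pd.return is built for the combined outputs. The same primitive can inline
// a graph body into its parent block, as the split pass further below does; a
// minimal helper in that style (illustrative, not from the patch; assumes
// graph_op has one non-empty body region):
static void inlineGraphBody(mlir::Block &parent, mlir::Operation *graph_op) {
  mlir::Block *body = &graph_op->getRegion(0).front();
  mlir::Operation *ret = body->getTerminator();
  graph_op->replaceAllUsesWith(ret->getOperands());
  auto range = body->without_terminator();
  parent.getOperations().splice(mlir::Block::iterator(graph_op),
                                body->getOperations(),
                                range.begin(), range.end());
  graph_op->erase();
}
// Splicing can leave a user ahead of its producer in the parent block, which
// is why the topoSortBlock below re-sorts ops before the terminator.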
void topoSortBlock(mlir::Block &body) { // NOLINT - llvm::SetVector toSort; + llvm::SetVector toSort; if (body.empty()) return; for (auto it = body.rbegin(); it != body.rend(); ++it) { toSort.insert(&*it); } - llvm::SetVector result = - ::mlir::topologicalSort(std::move(toSort)); + llvm::SetVector result = + mlir::topologicalSort(std::move(toSort)); for (auto *op : result) { op->moveBefore(body.getTerminator()); } @@ -145,21 +144,21 @@ void topoSortBlock(mlir::Block &body) { // NOLINT // Implementation of the trtGraphFusePass. void trtGraphFusePass::runOnFunction() { mlir::Block &body = getFunction().front(); - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); bool changed = false; do { changed = false; for (auto &op : body) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr == graph_op) continue; for (auto user_op : op.getUsers()) { - ::mlir::pd::GraphOp user_graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(user_op); + mlir::pd::GraphOp user_graph_op = + ::llvm::dyn_cast_or_null(user_op); if (nullptr == user_graph_op) continue; // get all dst input nodes except src. - std::vector<::mlir::Operation *> source_nodes; + std::vector source_nodes; for (auto operand : user_op->getOperands()) { auto input = operand.getDefiningOp(); if (input != &op && input != nullptr) { @@ -167,9 +166,8 @@ void trtGraphFusePass::runOnFunction() { } } // Reverse DFS from the source_nodes. - if (!reverseDfs(source_nodes, [&op](const ::mlir::Operation *n) { - return n == &op; - })) { + if (!reverseDfs(source_nodes, + [&op](const mlir::Operation *n) { return n == &op; })) { mergeTwoAdjacentGraphOp(builder, graph_op, user_graph_op); changed = true; break; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h index e7134e88f316c..f1e555c6f67ec 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -28,15 +28,15 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" %m * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" %m * } ... * "pd.fetch" %d, %f * @@ -47,13 +47,13 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" %n, %s * } ... 
* "pd.fetch" %d, %f * } */ class trtGraphFusePass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphFusePass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc index 2b45364de2036..257f2b5285425 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc @@ -14,7 +14,7 @@ #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/tensorrt/trt_ops.h" @@ -22,24 +22,24 @@ namespace infrt { namespace trt { // Implementation of the trtGraphSplitPass。 void trtGraphSplitPass::runOnFunction() { - std::vector<::mlir::pd::GraphOp> worklist; - ::mlir::Block& block = getFunction().front(); + std::vector worklist; + mlir::Block& block = getFunction().front(); for (auto& op : block) { - ::mlir::pd::GraphOp graph_op = - ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(&op); + mlir::pd::GraphOp graph_op = + ::llvm::dyn_cast_or_null(&op); if (nullptr != graph_op && graph_op.getBody()->getOperations().size() <= min_subgraph_size_) { worklist.push_back(graph_op); } } while (!worklist.empty()) { - ::mlir::pd::GraphOp graph_op = worklist.back(); + mlir::pd::GraphOp graph_op = worklist.back(); worklist.pop_back(); - ::mlir::Block* body = graph_op.getBody(); - auto fetch_op = body->getTerminator(); - graph_op.replaceAllUsesWith(fetch_op->getOperands()); + mlir::Block* body = graph_op.getBody(); + auto return_op = body->getTerminator(); + graph_op.replaceAllUsesWith(return_op->getOperands()); auto copy_range = body->without_terminator(); - block.getOperations().splice(::mlir::Block::iterator(graph_op), + block.getOperations().splice(mlir::Block::iterator(graph_op), body->getOperations(), copy_range.begin(), copy_range.end()); diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h index 092df0cf834e5..d30d186647fc3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -31,9 +31,9 @@ namespace trt { * %m = "pd.conv2d"(%a)... * %n = "pd.conv3d"(%m)... * %s = "pd.conv2d"(%a)... - * "pd.fetch" %n, %s + * "pd.return" (%n, %s) * } ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -42,11 +42,11 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } */ class trtGraphSplitPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc index 7b7fbb05c1d13..4e8d40b982b2e 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc @@ -14,49 +14,48 @@ #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h" -#include "mlir/IR/Builders.h" +#include #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/tensorrt/trt_ops.h" namespace infrt { namespace trt { // Implementation of the trtOpTellerPass。 void trtOpTellerPass::runOnFunction() { - ::mlir::Block &body = getFunction().front(); - std::vector<::mlir::Operation *> worklist; + mlir::Block &body = getFunction().front(); + std::vector worklist; worklist.reserve(body.getOperations().size()); for (auto &op : body) { worklist.push_back(&op); } // Build GraphOp. - ::mlir::OpBuilder builder(&body, body.begin()); + mlir::OpBuilder builder(&body, body.begin()); while (!worklist.empty()) { auto *op = worklist.back(); worklist.pop_back(); if (op == nullptr) continue; - auto op1 = ::llvm::dyn_cast_or_null<::mlir::pd::FeedOp>(op); + auto op1 = ::llvm::dyn_cast_or_null(op); if (op1) continue; - auto op2 = ::llvm::dyn_cast_or_null<::mlir::pd::FetchOp>(op); + auto op2 = ::llvm::dyn_cast_or_null(op); if (op2) continue; - auto op3 = ::llvm::dyn_cast_or_null<::mlir::pd::GraphOp>(op); + auto op3 = ::llvm::dyn_cast_or_null(op); if (op3) continue; builder.setInsertionPoint(op); auto loc = getFunction().getLoc(); - auto graph_op = builder.create<::mlir::pd::GraphOp>( + auto graph_op = builder.create( loc, op->getResultTypes(), op->getOperands()); - ::llvm::SmallVector<::mlir::Value, 4> tblgen_repl_values; + ::llvm::SmallVector tblgen_repl_values; for (auto v : - ::llvm::SmallVector<::mlir::Value, 4>{graph_op.getODSResults(0)}) { + ::llvm::SmallVector{graph_op.getODSResults(0)}) { tblgen_repl_values.push_back(v); } op->replaceAllUsesWith(tblgen_repl_values); // Build graph op. - ::mlir::Block *block = new ::mlir::Block; + mlir::Block *block = new mlir::Block; graph_op.body().push_back(block); op->moveBefore(block, block->begin()); builder.setInsertionPointToEnd(block); - builder.create(loc, op->getResults()); + builder.create(loc, op->getResults()); } } } // namespace trt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h index b03945b3459c0..fb16c974f7fb3 100644 --- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h +++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h @@ -13,7 +13,7 @@ // limitations under the License. #pragma once -#include "mlir/Pass/Pass.h" +#include namespace infrt { namespace trt { @@ -29,7 +29,7 @@ namespace trt { * %c = "pd.conv2d"(%a) ... * %d = "pd.conv3d"(%c) ... * %f = "pd.conv2d"(%a) ... - * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * * destination func: @@ -37,23 +37,23 @@ namespace trt { * %a = "pd.feed"()... * %c = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %d = "pd.graph"(%c) { * %m = "pd.conv3d"(%c)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... * %f = "pd.graph"(%a) { * %m = "pd.conv2d"(%a)... - * "pd.fetch" %m + * "pd.return" (%m) * } ... 
- * "pd.fetch" %d, %f + * "pd.fetch" (%d, %f) * } * TODO(winter-wang): Supplementary how to judge the operators can be supported * by tensorrt. */ class trtOpTellerPass - : public ::mlir::PassWrapper { + : public mlir::PassWrapper { public: ::llvm::StringRef getName() const override { return "trtOpTellerPass"; } void runOnFunction() override; diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 4c02238b10e1d..35b7967892caf 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -13,27 +13,25 @@ // limitations under the License. #include "paddle/infrt/dialect/tensorrt/trt_ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include namespace infrt { namespace trt { -TensorRTDialect::TensorRTDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect("trt", context, ::mlir::TypeID::get()) { +TensorRTDialect::TensorRTDialect(mlir::MLIRContext *context) + : mlir::Dialect("trt", context, mlir::TypeID::get()) { addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT >(); -#undef GET_OP_LIST } -#define GET_OP_CLASSES -#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT -#undef GET_OP_CLASSES - } // namespace trt } // namespace infrt + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index c9043c2280de0..a37491ec1abc7 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -14,37 +14,32 @@ #pragma once -#include "mlir/Dialect/Traits.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" -#include "mlir/Interfaces/CallInterfaces.h" -#include "mlir/Interfaces/DerivedAttributeOpInterface.h" -#include "mlir/Interfaces/InferTypeOpInterface.h" -#include "mlir/Interfaces/LoopLikeInterface.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include namespace infrt { namespace trt { -class TensorRTDialect : public ::mlir::Dialect { +class TensorRTDialect : public mlir::Dialect { public: - explicit TensorRTDialect(::mlir::MLIRContext* context); + explicit TensorRTDialect(mlir::MLIRContext* context); static llvm::StringRef getDialectNamespace() { return "trt"; } }; -// mlir bug。 can be removed safety when update mlir to llvm11. 
-using namespace mlir; // NOLINT +} // namespace trt +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/tensorrt/trt_ops.hpp.inc" -#undef GET_OP_CLASSES - -} // namespace trt -} // namespace infrt diff --git a/paddle/infrt/dialect/test_kernels.cc b/paddle/infrt/dialect/test_kernels.cc index 894d96f95ad5c..c4588d7cf8bab 100644 --- a/paddle/infrt/dialect/test_kernels.cc +++ b/paddle/infrt/dialect/test_kernels.cc @@ -14,14 +14,13 @@ #include "paddle/infrt/dialect/test_kernels.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/OpDefinition.h" -#include "mlir/IR/OpImplementation.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/IR/TypeUtilities.h" - -namespace infrt::dialect { +#include +#include +#include +#include +namespace infrt { +namespace dialect { //===----------------------------------------------------------------------===// // BenchmarkOp //===----------------------------------------------------------------------===// @@ -32,65 +31,67 @@ namespace infrt::dialect { // ... // } -static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT - OperationState &result) { // NOLINT - StringAttr nameAttr; +static mlir::ParseResult parseBenchmarkOp( + mlir::OpAsmParser &parser, // NOLINT + mlir::OperationState &result) { // NOLINT + mlir::StringAttr nameAttr; if (parser.parseAttribute(nameAttr, "name", result.attributes)) - return failure(); + return mlir::failure(); // Parse the operands, e.g. (%c : i32, %d : f32) - if (parser.parseLParen()) return failure(); + if (parser.parseLParen()) return mlir::failure(); - SmallVector operands; - SmallVector types; + llvm::SmallVector operands; + llvm::SmallVector types; llvm::SMLoc type_loc = parser.getCurrentLocation(); if (parser.parseOptionalRParen()) { // Parse non-empty operands do { // Parse %c : i32, - OpAsmParser::OperandType operand; - Type type; + mlir::OpAsmParser::OperandType operand; + mlir::Type type; if (parser.parseOperand(operand) || parser.parseColonType(type)) - return failure(); + return mlir::failure(); operands.push_back(operand); types.push_back(type); } while (succeeded(parser.parseOptionalComma())); - if (parser.parseRParen()) return failure(); + if (parser.parseRParen()) return mlir::failure(); } if (parser.resolveOperands(operands, types, type_loc, result.operands)) - return failure(); + return mlir::failure(); // Parse the keyword attribute, e.g. 
max_count = 100, duration_secs = 1 do { - StringRef attr; - Attribute resultAttr; + mlir::StringRef attr; + mlir::Attribute resultAttr; if (parser.parseKeyword(&attr) || parser.parseEqual() || parser.parseAttribute(resultAttr, parser.getBuilder().getIntegerType(32), attr, result.attributes)) - return failure(); - } while (succeeded(parser.parseOptionalComma())); + return mlir::failure(); + } while (mlir::succeeded(parser.parseOptionalComma())); // Set the default attribute num_warmup_runs to 1 if unset auto setDefaultAttrIfUnset = [&](const char *attr_name, int value) { bool found = llvm::any_of(result.attributes, - [attr_name](const NamedAttribute &attr) { - return attr.first == attr_name; + [attr_name](const mlir::NamedAttribute &attr) { + return attr.getName() == attr_name; }); if (!found) { - IntegerAttr default_val = parser.getBuilder().getI32IntegerAttr(value); + mlir::IntegerAttr default_val = + parser.getBuilder().getI32IntegerAttr(value); result.addAttribute(attr_name, default_val); } }; setDefaultAttrIfUnset("num_warmup_runs", 1); - Region *target = result.addRegion(); + mlir::Region *target = result.addRegion(); return parser.parseRegion(*target, operands, types, @@ -102,11 +103,11 @@ static ParseResult parseBenchmarkOp(OpAsmParser &parser, // NOLINT // max_count = 100, duration_secs = 1 { // ... // } -static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT +static void print(mlir::OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p << "infrt.benchmark "; // Print the name attribute, e.g "add.i32" - auto name_attr = op.getAttr("name"); + auto name_attr = op->getAttr("name"); p << name_attr; // Print the operands and types, e.g. (%c : i32, %d : f32) @@ -120,13 +121,13 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT bool need_comma = false; // Print the attributes, e.g. max_count = 100, duration_secs = 1 - for (auto &name_attr : op.getAttrs()) { - auto id = name_attr.first; + for (auto &name_attr : op->getAttrs()) { + auto id = name_attr.getName(); if (id == "name") continue; if (need_comma) p << ", "; - auto attr = name_attr.second; + auto attr = name_attr.getValue(); p << id << " = "; - if (auto int_attr = attr.dyn_cast()) { + if (auto int_attr = attr.dyn_cast()) { int_attr.getValue().print(p.getStream(), /*isSigned=*/false); } else { op.emitOpError("Unexpected attribute"); @@ -142,7 +143,7 @@ static void print(OpAsmPrinter &p, BenchmarkOp op) { // NOLINT p.printRegion(op.region(), /*printEntryBlockArgs=*/false); } -static LogicalResult verify(BenchmarkOp op) { +static mlir::LogicalResult verify(BenchmarkOp op) { // Verify that the target benchmark region has exactly one return value. auto ®ion = op.region(); auto &last_op = region.front().back(); @@ -154,10 +155,10 @@ static LogicalResult verify(BenchmarkOp op) { "incorrect number of return values. One return value is expected"); } - return success(); + return mlir::success(); } +} // namespace dialect +} // namespace infrt #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.cpp.inc" - -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/test_kernels.h b/paddle/infrt/dialect/test_kernels.h index 29d4209cb7280..73c8a6fb387bc 100644 --- a/paddle/infrt/dialect/test_kernels.h +++ b/paddle/infrt/dialect/test_kernels.h @@ -13,11 +13,8 @@ // limitations under the License. 
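// Driver for the attr.first/attr.second edits above: newer MLIR releases turn
// NamedAttribute from a std::pair into a class with accessors. An attribute
// walk in the new style, shaped like the benchmark printer above
// (illustrative helper, not from the patch):
static void dumpAttrs(mlir::Operation *op, llvm::raw_ostream &os) {
  for (const mlir::NamedAttribute &name_attr : op->getAttrs()) {
    auto id = name_attr.getName();                // was name_attr.first
    mlir::Attribute attr = name_attr.getValue();  // was name_attr.second
    if (id == "name") continue;  // the printer above skips this one too
    os << id << " = " << attr << "\n";
  }
}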
#pragma once -#include "mlir/IR/OpDefinition.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" +#include +#include -namespace infrt::dialect { -using namespace mlir; // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/test_kernels.hpp.inc" -} // namespace infrt::dialect diff --git a/paddle/infrt/dialect/types.cc b/paddle/infrt/dialect/types.cc deleted file mode 100644 index 6d6f6a20b46c9..0000000000000 --- a/paddle/infrt/dialect/types.cc +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/dialect/types.h" - -namespace infrt::hlir::mlir {} // namespace infrt::hlir::mlir diff --git a/paddle/infrt/dialect/types.h b/paddle/infrt/dialect/types.h deleted file mode 100644 index a9a2b61871cc0..0000000000000 --- a/paddle/infrt/dialect/types.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include diff --git a/paddle/infrt/host_context/core_runtime.cc b/paddle/infrt/host_context/core_runtime.cc index cdb8cc99ecb26..e3917bd07d242 100644 --- a/paddle/infrt/host_context/core_runtime.cc +++ b/paddle/infrt/host_context/core_runtime.cc @@ -23,7 +23,8 @@ #include "paddle/infrt/host_context/op_executable.h" #include "paddle/infrt/host_context/symbol_table.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct CoreRuntime::Impl { KernelRegistry* kernel_registry{}; @@ -90,4 +91,5 @@ llvm::SmallVector CoreRuntime::GetResults( CoreRuntime::~CoreRuntime() {} -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/core_runtime.h b/paddle/infrt/host_context/core_runtime.h index 802f8b17bb010..acb6a66cac630 100644 --- a/paddle/infrt/host_context/core_runtime.h +++ b/paddle/infrt/host_context/core_runtime.h @@ -22,7 +22,8 @@ #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; class OpExecutable; @@ -83,4 +84,5 @@ class CoreRuntimeBuilder : public CoreRuntime { OpExecutableBuilder* NewOpExecutable(const std::string& op_name); }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h index 20cb17dc7fbe2..5186b88fe2c41 100644 --- a/paddle/infrt/host_context/kernel_frame.h +++ b/paddle/infrt/host_context/kernel_frame.h @@ -21,7 +21,8 @@ #include "llvm/ADT/SmallVector.h" #include "paddle/infrt/host_context/value.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { /** * KernelFrame captures the states(input arguments, attributes, results) @@ -163,4 +164,5 @@ class KernelFrameBuilder : public KernelFrame { } }; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc index f36ec2a1cac7d..7fca56343041c 100644 --- a/paddle/infrt/host_context/kernel_registry_test.cc +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -18,7 +18,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } @@ -44,4 +45,5 @@ TEST(KernelRegistry, basic) { ASSERT_EQ(results[0]->get(), 3); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc index 1904eb106a293..bebd8d86e50bb 100644 --- a/paddle/infrt/host_context/kernel_utils_test.cc +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -16,7 +16,8 @@ #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { int add_i32(int a, int b) { return a + b; } float add_f32(float a, float b) { return a + b; } @@ -66,4 +67,5 @@ TEST(KernelImpl, pair) { ASSERT_EQ(results[1]->get(), 3.f); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_function_executable.cc b/paddle/infrt/host_context/mlir_function_executable.cc index 5f8dacf8e448a..47ec27ebec300 100644 --- a/paddle/infrt/host_context/mlir_function_executable.cc +++ b/paddle/infrt/host_context/mlir_function_executable.cc @@ -15,6 +15,7 @@ #include 
"paddle/infrt/host_context/mlir_function_executable.h" #include +#include #include // NOLINT diff --git a/paddle/infrt/host_context/mlir_function_executable.h b/paddle/infrt/host_context/mlir_function_executable.h index ba5fa154d6fcc..a6428df86e6b2 100644 --- a/paddle/infrt/host_context/mlir_function_executable.h +++ b/paddle/infrt/host_context/mlir_function_executable.h @@ -13,7 +13,8 @@ // limitations under the License. #pragma once -#include +#include +#include #include #include diff --git a/paddle/infrt/host_context/mlir_program_executor.h b/paddle/infrt/host_context/mlir_program_executor.h index b2af4d2d79db5..c2ccb90640b21 100644 --- a/paddle/infrt/host_context/mlir_program_executor.h +++ b/paddle/infrt/host_context/mlir_program_executor.h @@ -15,9 +15,9 @@ #pragma once #include +#include +#include #include -#include -#include #include #include diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index 25324b1291582..3dbc7a702be38 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -16,8 +16,9 @@ #include #include +#include +#include #include -#include #include #include @@ -40,7 +41,8 @@ #include "paddle/infrt/host_context/value.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { template std::string DumpToString(T& op) { // NOLINT @@ -113,10 +115,10 @@ bool MlirToRuntimeTranslator::EmitConstantOp(mlir::Operation* op) { template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(32)) { return val.getInt(); } @@ -125,10 +127,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( } template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isInteger(64)) { return val.getInt(); } @@ -139,10 +141,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( // TODO(Superjomn) Make double and float parsing share some thing. 
template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF32()) return val.getValueAsDouble(); } return boost::none; @@ -150,10 +152,10 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - if (attr->isa()) { - auto val = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); if (val.getType().isF64()) return val.getValueAsDouble(); } return boost::none; @@ -161,17 +163,17 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - return attr->cast().getValue().str(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + return attr.cast().getValue().str(); } #define PROCESS_ARRAY_INT(type__, bits__) \ template <> \ boost::optional> MlirToRuntimeTranslator::EmitAttribute( \ - const mlir::Attribute* attr) { \ - if (!attr->isa()) return boost::none; \ - auto array = attr->cast(); \ + const mlir::Attribute& attr) { \ + if (!attr.isa()) return boost::none; \ + auto array = attr.cast(); \ CHECK(!array.empty()); \ \ if (!array[0].getType().isInteger(bits__)) { \ @@ -191,9 +193,9 @@ PROCESS_ARRAY_INT(int64_t, 64); template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF32()) return boost::none; @@ -207,9 +209,9 @@ boost::optional> MlirToRuntimeTranslator::EmitAttribute( template <> boost::optional> MlirToRuntimeTranslator::EmitAttribute( - const mlir::Attribute* attr) { - if (!attr->isa()) return boost::none; - auto array = attr->cast(); + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + auto array = attr.cast(); CHECK(!array.empty()); if (!array[0].getType().isF64()) return boost::none; @@ -236,7 +238,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (int i = 0, e = op->getNumOperands(); i < e; i++) { // function argument as value auto operand = op->getOperand(i); - if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + /// if (operand.getKind() == mlir::Value::Kind::BlockArgument) { + if (operand.isa()) { mlir::BlockArgument arg = operand.dyn_cast(); Value* arg_value = GetValue(arg); impl_->cur_op->AppendArgument(arg_value); @@ -283,25 +286,25 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { for (size_t i = 0; i < attrs.size(); i++) { auto& attr = attrs[i]; - if (auto v = EmitAttribute(&attr.second)) { + if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else 
if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(*v)); - } else if (auto v = EmitAttribute(&attr.second)) { + } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); - } else if (auto v = EmitAttribute>(&attr.second)) { + } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); } else { LOG(FATAL) << "Not supported attribute type"; @@ -330,7 +333,7 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { llvm::SmallVector results; auto func_type = - mlir::FunctionType::get(inputs, results, region.getContext()); + mlir::FunctionType::get(region.getContext(), inputs, results); auto* function = impl_->cur_op->CreateFunctionExecutable( ®ion, func_type, &impl_->func_defs); impl_->cur_op->AppendAttribute(new Value(function)); @@ -555,4 +558,5 @@ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry) { execute.Run(); } -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.h b/paddle/infrt/host_context/mlir_to_runtime_translate.h index 598e81bfd96d8..fcd79eaf386ee 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.h +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.h @@ -29,7 +29,8 @@ class Attribute; class Value; } // namespace mlir -namespace infrt::host_context { +namespace infrt { +namespace host_context { class CoreRuntimeBuilder; class Value; @@ -73,7 +74,7 @@ class MlirToRuntimeTranslator { bool EmitCallOp(mlir::Operation* op, function_defs_t* function_table); template - boost::optional EmitAttribute(const mlir::Attribute* attr); + boost::optional EmitAttribute(const mlir::Attribute& attr); Value* GetOpResult(mlir::Operation* op); @@ -104,4 +105,5 @@ void MlirToRuntimeTranslate(mlir::ModuleOp module, CoreRuntimeBuilder* runtime); */ void TestMlir(mlir::ModuleOp module, KernelRegistry* registry); -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc index 9b85be977ab6c..375daa4515e17 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate_test.cc @@ -29,7 +29,8 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" -namespace infrt::host_context { +namespace infrt { +namespace host_context { TEST(MlirToRuntimeTranslate, basic) { mlir::MLIRContext context; @@ -48,7 +49,7 @@ func @main() -> () { )ROC"; auto module = dialect::LoadMlirSource(&context, source); - module->verify(); + 
   KernelRegistry registry;
   kernel::RegisterFloatBasicKernels(&registry);
@@ -74,7 +75,7 @@ func @main() -> () {
 )ROC";
 
   auto module = dialect::LoadMlirSource(&context, source);
-  module->verify();
+  EXPECT_TRUE(mlir::succeeded(module->verify()));
 
   KernelRegistry registry;
   kernel::RegisterFloatBasicKernels(&registry);
@@ -115,7 +116,7 @@ infrt.return %a0, %b0: !infrt.tensor<X86, NCHW, F32>, !infrt.tensor<X86, NCHW, F32>
 )ROC";
 
   auto module = dialect::LoadMlirSource(&context, source);
-  module->verify();
+  EXPECT_TRUE(mlir::succeeded(module->verify()));
 
   host_context::KernelRegistry registry;
@@ -157,4 +158,5 @@ infrt.return %a0, %b0: !infrt.tensor<X86, NCHW, F32>, !infrt.tensor<X86, NCHW, F32>
 
-}  // namespace infrt::host_context
+}  // namespace host_context
+}  // namespace infrt
diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc
 #include
 
 #include "paddle/infrt/host_context/kernel_frame.h"
@@ -21,7 +22,8 @@
 #include "paddle/infrt/host_context/mlir_function_executable.h"
 #include "paddle/infrt/host_context/symbol_table.h"
 
-namespace infrt::host_context {
+namespace infrt {
+namespace host_context {
 
 struct OpExecutable::Impl {
   Impl(const std::string& op_name,
@@ -148,4 +150,5 @@ void OpExecutable::Execute() {
 
 OpExecutable::~OpExecutable() {}
 
-}  // namespace infrt::host_context
+}  // namespace host_context
+}  // namespace infrt
diff --git a/paddle/infrt/host_context/op_executable.h b/paddle/infrt/host_context/op_executable.h
index e2248225a5caf..550f6ab6349ed 100644
--- a/paddle/infrt/host_context/op_executable.h
+++ b/paddle/infrt/host_context/op_executable.h
@@ -14,19 +14,18 @@
 #pragma once
 #include
-
+#include
+#include
 #include
 #include
 #include
-#include "mlir/IR/Function.h"
-#include "mlir/IR/Region.h"
-
 namespace mlir {
 class FuncOp;
 }  // namespace mlir
 
-namespace infrt::host_context {
+namespace infrt {
+namespace host_context {
 
 class SymbolTable;
 class KernelRegistry;
@@ -89,4 +88,5 @@ class OpExecutableBuilder : public OpExecutable {
                       function_defs_t* function_defs);
 };
 
-}  // namespace infrt::host_context
+}  // namespace host_context
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/basic_kernels.cc b/paddle/infrt/kernel/basic_kernels.cc
index d7f2c3865157d..b186cfcfd2b35 100644
--- a/paddle/infrt/kernel/basic_kernels.cc
+++ b/paddle/infrt/kernel/basic_kernels.cc
@@ -23,7 +23,8 @@
 using infrt::host_context::Attribute;
 
-namespace infrt::kernel {
+namespace infrt {
+namespace kernel {
 
 template <typename T>
 T add(T a, T b) {
@@ -82,4 +83,5 @@ void RegisterFloatBasicKernels(host_context::KernelRegistry *registry) {
   registry->AddKernel("infrt.print.f32", INFRT_KERNEL(print<float>));
 }
 
-}  // namespace infrt::kernel
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/basic_kernels.h b/paddle/infrt/kernel/basic_kernels.h
index 9e98885cf6ebf..feb66be61f530 100644
--- a/paddle/infrt/kernel/basic_kernels.h
+++ b/paddle/infrt/kernel/basic_kernels.h
@@ -15,13 +15,16 @@
 #pragma once
 #include
 
-namespace infrt::host_context {
+namespace infrt {
+namespace host_context {
 
 struct KernelRegistry;
 
-}  // namespace infrt::host_context
+}  // namespace host_context
+}  // namespace infrt
 
-namespace infrt::kernel {
+namespace infrt {
+namespace kernel {
 
 /**
  * Register all the basic kernels to \p registry.
@@ -31,4 +34,5 @@ void RegisterBasicKernels(host_context::KernelRegistry* registry); void RegisterIntBasicKernels(host_context::KernelRegistry* registry); void RegisterFloatBasicKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 2fa477aa4dbda..51e0004922374 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -25,7 +25,8 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { using namespace host_context; // NOLINT using namespace tensor; // NOLINT @@ -76,4 +77,5 @@ void RegisterTensorKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShallowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_kernels.h b/paddle/infrt/kernel/tensor_kernels.h index 8f2180ba80a4f..df8e25c32393c 100644 --- a/paddle/infrt/kernel/tensor_kernels.h +++ b/paddle/infrt/kernel/tensor_kernels.h @@ -14,12 +14,16 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.cc b/paddle/infrt/kernel/tensor_shape_kernels.cc index a04b492819298..4edbecfa10886 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.cc +++ b/paddle/infrt/kernel/tensor_shape_kernels.cc @@ -24,7 +24,8 @@ #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/tensor/tensor_shape.h" -namespace infrt::kernel { +namespace infrt { +namespace kernel { void PrintShape(const tensor::TensorShape& shape) { llvm::raw_os_ostream oos(std::cout); @@ -35,4 +36,5 @@ void RegisterTensorShapeKernels(host_context::KernelRegistry* registry) { registry->AddKernel("ts.print_shape", INFRT_KERNEL(PrintShape)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/tensor_shape_kernels.h b/paddle/infrt/kernel/tensor_shape_kernels.h index e87c6c37e88a0..e31a37463be43 100644 --- a/paddle/infrt/kernel/tensor_shape_kernels.h +++ b/paddle/infrt/kernel/tensor_shape_kernels.h @@ -14,14 +14,18 @@ #pragma once -namespace infrt::host_context { +namespace infrt { +namespace host_context { class KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { void RegisterTensorShapeKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.cc b/paddle/infrt/kernel/test_kernels.cc index d5f64d09b602f..ccfb3356a855f 100644 --- a/paddle/infrt/kernel/test_kernels.cc +++ b/paddle/infrt/kernel/test_kernels.cc @@ -33,7 +33,8 @@ using infrt::host_context::Attribute; using infrt::host_context::MlirFunctionExecutable; using infrt::host_context::RemainingArguments; -namespace infrt::kernel { +namespace infrt { +namespace kernel { namespace { class BenchmarkStats { public: @@ -197,4 +198,5 @@ void 
RegisterTestKernels(host_context::KernelRegistry *registry) { INFRT_KERNEL(ShadowCopyTensor)); } -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/test_kernels.h b/paddle/infrt/kernel/test_kernels.h index f42884dfaf2c9..f5639ec1afaad 100644 --- a/paddle/infrt/kernel/test_kernels.h +++ b/paddle/infrt/kernel/test_kernels.h @@ -15,17 +15,21 @@ #pragma once #include -namespace infrt::host_context { +namespace infrt { +namespace host_context { struct KernelRegistry; -} // namespace infrt::host_context +} // namespace host_context +} // namespace infrt -namespace infrt::kernel { +namespace infrt { +namespace kernel { /** * Register all the test kernels to registry. */ void RegisterTestKernels(host_context::KernelRegistry* registry); -} // namespace infrt::kernel +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/paddle/cpp/desc_api.h b/paddle/infrt/paddle/cpp/desc_api.h index ccd79c048ab14..3b2dcb0018b2f 100644 --- a/paddle/infrt/paddle/cpp/desc_api.h +++ b/paddle/infrt/paddle/cpp/desc_api.h @@ -18,7 +18,9 @@ #include #include -namespace infrt::paddle::cpp { +namespace infrt { +namespace paddle { +namespace cpp { /* * Compatible interfaces for all the different kinds of XXXDesc. All the XXXDesc @@ -226,4 +228,6 @@ class ProgramDescAPI { virtual void SetVersion(int64_t version) = 0; }; -} // namespace infrt::paddle::cpp +} // namespace cpp +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.cc b/paddle/infrt/paddle/model_parser.cc index 285280e69435b..f3de1a630451c 100644 --- a/paddle/infrt/paddle/model_parser.cc +++ b/paddle/infrt/paddle/model_parser.cc @@ -22,7 +22,8 @@ #include "paddle/infrt/common/target.h" #include "paddle/infrt/common/type.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { int SizeOfType(framework_proto::VarType::Type type) { using Type = framework_proto::VarType::Type; @@ -169,4 +170,5 @@ void LoadParam(const std::string &path, _Variable *out, const Target &target) { LoadLoDTensor(fin, out, target); } -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/model_parser.h b/paddle/infrt/paddle/model_parser.h index 73125fadedb82..373f77033dcef 100644 --- a/paddle/infrt/paddle/model_parser.h +++ b/paddle/infrt/paddle/model_parser.h @@ -25,7 +25,8 @@ #include "paddle/infrt/paddle/scope.h" #include "paddle/infrt/paddle/tensor.h" -namespace infrt::paddle { +namespace infrt { +namespace paddle { namespace framework_proto = ::paddle::framework::proto; // Read a __model__ file. 
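[Editorial gloss, hedged] A recurring pattern in the infrt hunks above and below: `namespace infrt::host_context { ... }` is C++17 nested-namespace definition syntax, and this patch mechanically rewrites every occurrence to the pre-C++17 spelling. The motivation (compatibility with pre-C++17 toolchains) is inferred from the mechanical nature of the change; the commit message does not state it. Illustrative only:

// C++17 nested-namespace definition (the form the patch removes):
namespace infrt::host_context {
struct KernelRegistry;
}  // namespace infrt::host_context

// Pre-C++17 equivalent (the form the patch writes instead):
namespace infrt {
namespace host_context {
struct KernelRegistry;
}  // namespace host_context
}  // namespace infrt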
@@ -52,4 +53,5 @@ void TensorFromStream( const common::Target& target = common::DefaultHostTarget()); void ReadBinaryFile(const std::string& filename, std::string* contents); -} // namespace infrt::paddle +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.cc b/paddle/infrt/paddle/pb/block_desc.cc index 11186bc68af16..5b28fa5464c54 100644 --- a/paddle/infrt/paddle/pb/block_desc.cc +++ b/paddle/infrt/paddle/pb/block_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/block_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::VarDesc* BlockDesc::GetVar( @@ -40,4 +42,6 @@ framework_proto::OpDesc* BlockDesc::AddOp() { return desc_->add_ops(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/block_desc.h b/paddle/infrt/paddle/pb/block_desc.h index 9c1b7f9adf172..c9e325699a4bc 100644 --- a/paddle/infrt/paddle/pb/block_desc.h +++ b/paddle/infrt/paddle/pb/block_desc.h @@ -18,7 +18,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -74,4 +76,6 @@ class BlockDesc : public cpp::BlockDescAPI { framework_proto::BlockDesc* desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.cc b/paddle/infrt/paddle/pb/op_desc.cc index c7b1e66f50642..32dcefb1ac684 100644 --- a/paddle/infrt/paddle/pb/op_desc.cc +++ b/paddle/infrt/paddle/pb/op_desc.cc @@ -14,7 +14,9 @@ #include "paddle/infrt/paddle/pb/op_desc.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { google::protobuf::internal::RepeatedPtrIterator FindAttr(framework_proto::OpDesc *desc, const std::string &name) { @@ -136,4 +138,6 @@ GET_ATTRS_IMPL(std::vector, strings); GET_ATTR_IMPL(std::string, s); GET_ATTRS_IMPL(std::vector, longs); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/op_desc.h b/paddle/infrt/paddle/pb/op_desc.h index 81d57d9f32252..2829f2aca2e08 100644 --- a/paddle/infrt/paddle/pb/op_desc.h +++ b/paddle/infrt/paddle/pb/op_desc.h @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/framework.pb.h" #include "paddle/infrt/support/variant.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; @@ -195,4 +197,6 @@ template <> void OpDesc::SetAttr>(const std::string &name, const std::vector &v); -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.cc b/paddle/infrt/paddle/pb/program_desc.cc index ed8a7e36e0129..9d725485a974d 100644 --- a/paddle/infrt/paddle/pb/program_desc.cc +++ b/paddle/infrt/paddle/pb/program_desc.cc @@ -17,7 +17,9 @@ #include #include -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { template <> framework_proto::BlockDesc* ProgramDesc::GetBlock( @@ -32,4 +34,6 @@ ProgramDesc::AddBlock() { return desc_->add_blocks(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/program_desc.h b/paddle/infrt/paddle/pb/program_desc.h 
index 4adad650c974d..b1e64f8e86611 100644 --- a/paddle/infrt/paddle/pb/program_desc.h +++ b/paddle/infrt/paddle/pb/program_desc.h @@ -21,7 +21,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; class ProgramDesc : public cpp::ProgramDescAPI { @@ -58,4 +60,6 @@ class ProgramDesc : public cpp::ProgramDescAPI { framework_proto::ProgramDesc *desc_; // not_own }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.cc b/paddle/infrt/paddle/pb/var_desc.cc index cf80df4f1b845..7ea2e24da3446 100644 --- a/paddle/infrt/paddle/pb/var_desc.cc +++ b/paddle/infrt/paddle/pb/var_desc.cc @@ -19,7 +19,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { cpp::VarDescAPI::Type VarDesc::GetType() const { auto type = desc_->type().type(); @@ -364,4 +366,6 @@ VarDesc::mutable_tensor_descs() { return std::vector(); } -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt diff --git a/paddle/infrt/paddle/pb/var_desc.h b/paddle/infrt/paddle/pb/var_desc.h index 4cff5fdee0375..7215ba6bb6aa7 100644 --- a/paddle/infrt/paddle/pb/var_desc.h +++ b/paddle/infrt/paddle/pb/var_desc.h @@ -23,7 +23,9 @@ #include "paddle/infrt/paddle/cpp/desc_api.h" #include "paddle/infrt/paddle/framework.pb.h" -namespace infrt::paddle::pb { +namespace infrt { +namespace paddle { +namespace pb { namespace framework_proto = ::paddle::framework::proto; // convert between std::vector and protobuf repeated. @@ -121,4 +123,6 @@ class VarDesc : public cpp::VarDescAPI { framework_proto::VarDesc *desc_; }; -} // namespace infrt::paddle::pb +} // namespace pb +} // namespace paddle +} // namespace infrt From 87ee3e4f5438c567796e128b73eb7703aa56d2ec Mon Sep 17 00:00:00 2001 From: Zhangjingyu06 <92561254+Zhangjingyu06@users.noreply.github.com> Date: Fri, 14 Jan 2022 16:15:47 +0800 Subject: [PATCH 03/10] [XPU]add stack_grad op for kunlun2,*test=kunlun (#38674) * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun2,*test=kunlun * [XPU]add split op for kunlun,*test=kunlun * [XPU]add stack_grad op for kunlun2,*test=kunlun Co-authored-by: QingshuChen --- paddle/fluid/operators/stack_op_xpu.cc | 43 ++++++++++++++++--- .../fluid/platform/device/xpu/xpu1_op_list.h | 1 + .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + .../tests/unittests/xpu/test_stack_op_xpu.py | 19 +++++++- 4 files changed, 58 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index 01ec4a2b16b4a..a2590e1180c1a 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/operators/stack_op.h"
 #include
-#ifdef PADDLE_WITH_XPU
+#include
+#include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
@@ -59,14 +62,44 @@ class StackXPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class StackGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dy = ctx.Input<framework::Tensor>(framework::GradVarName("Y"));
+    auto dx = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    auto axis = ctx.Attr<int>("axis");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto dy_dims = dy->dims();
+
+    if (axis < 0) axis += dy_dims.size() + 1;
+    auto dy_shape = framework::vectorize<int>(dy_dims);
+
+    std::vector<int> dx_dims_list(dx.size(), 1);
+    std::vector<T*> dx_lists;
+    for (auto out : dx) {
+      dx_lists.push_back(out->mutable_data<T>(ctx.GetPlace()));
+    }
+
+    int r = xpu::split<T>(dev_ctx.x_context(), dy->data<T>(), dx_lists,
+                          dy_shape, dx_dims_list, axis);
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External(
+                          "The stack_grad XPU kernel return wrong value[%d %s]",
+                          r, XPUAPIErrorMsg[r]));
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace plat = paddle::platform;
 namespace ops = paddle::operators;
-
 REGISTER_OP_XPU_KERNEL(stack,
-                       ops::StackXPUKernel,
+                       ops::StackXPUKernel,
                        ops::StackXPUKernel,
-                       ops::StackXPUKernel);
+                       ops::StackXPUKernel);
+REGISTER_OP_XPU_KERNEL(stack_grad,
+                       ops::StackGradXPUKernel<plat::XPUDeviceContext, float>,
+                       ops::StackGradXPUKernel<plat::XPUDeviceContext, int>);
 #endif
diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h
index 26a1426bea036..a76bdd4ae9679 100644
--- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h
@@ -300,6 +300,7 @@ XPUOpMap& get_kl1_ops() {
                      pOpKernelType(vartype::UINT8, XPUPlace()),
                      pOpKernelType(vartype::FP32, XPUPlace())})},
     {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"tanh", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index 79261a5d7bc88..3d140b4693a6f 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -333,6 +333,8 @@ XPUOpMap& get_kl2_ops() {
     {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                             pOpKernelType(vartype::INT64, XPUPlace()),
                             pOpKernelType(vartype::INT32, XPUPlace())})},
+    {"stack_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
+                                 pOpKernelType(vartype::INT32, XPUPlace())})},
     {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
                           pOpKernelType(vartype::FP16, XPUPlace())})},
     {"tanh_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
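[Editorial gloss, hedged] The StackGradXPUKernel above relies on the identity grad(stack) = split: the upstream gradient dY is cut back into one slice per stacked input along the stack axis, which is why the kernel can delegate wholesale to xpu::split. A standalone reference for the index math (illustrative names only, not the XPU API):

// Reference semantics for stack's gradient. dY has shape [pre, n, post]
// flattened row-major; the result is n slices of shape [pre, post] — exactly
// what splitting dY at the stack axis yields.
#include <cassert>
#include <vector>

std::vector<std::vector<float>> StackGradReference(
    const std::vector<float>& dy, int pre, int n, int post) {
  assert(static_cast<int>(dy.size()) == pre * n * post);
  std::vector<std::vector<float>> dx(n, std::vector<float>(pre * post));
  for (int p = 0; p < pre; ++p)
    for (int i = 0; i < n; ++i)
      for (int q = 0; q < post; ++q)
        dx[i][p * post + q] = dy[(p * n + i) * post + q];
  return dx;
}

int main() {
  // Two length-2 tensors stacked at axis 0 -> dY of shape [2, 2]
  // (pre = 1, n = 2, post = 2); the gradient splits cleanly back apart.
  std::vector<float> dy = {1, 2, 3, 4};
  auto dx = StackGradReference(dy, /*pre=*/1, /*n=*/2, /*post=*/2);
  assert(dx[0][0] == 1 && dx[0][1] == 2 && dx[1][0] == 3 && dx[1][1] == 4);
  return 0;
}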
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
index 68e5a6ccdbfb7..20446aee41ec7 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_stack_op_xpu.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -66,6 +66,15 @@ def test_check_output(self):
             place = paddle.XPUPlace(0)
             self.check_output_with_place(place)
 
+    def test_check_grad(self):
+        if self.dtype == 'int64' or self.dtype == 'int32':
+            pass
+        else:
+            if paddle.is_compiled_with_xpu():
+                paddle.enable_static()
+                place = paddle.XPUPlace(0)
+                self.check_grad_with_place(place, self.get_x_names(), 'Y')
+
 
 class TestStackOp1(TestStackOpBase):
     def initParameters(self):
@@ -81,11 +90,17 @@ class TestStackOp3(TestStackOpBase):
     def initParameters(self):
         self.axis = -1
 
+    def test_check_grad(self):
+        pass
+
 
 class TestStackOp4(TestStackOpBase):
     def initParameters(self):
         self.axis = -4
 
+    def test_check_grad(self):
+        pass
+
 
 class TestStackOp5(TestStackOpBase):
     def initParameters(self):
@@ -113,7 +128,7 @@ def initDefaultParameters(self):
         self.num_inputs = 4
         self.input_dim = (5, 6, 7)
         self.axis = 0
-        self.dtype = 'int'
+        self.dtype = 'int32'
 
     def initParameters(self):
         self.num_inputs = 16
From 050aa6fe5a524b0e7b85201c54a0da315701518d Mon Sep 17 00:00:00 2001
From: heliqi
Date: Fri, 14 Jan 2022 16:50:56 +0800
Subject: [PATCH 04/10] add flatten_contiguous_range OpConvert for Paddle-TRT
 (#38922)

* add trt_convert_flatten_contiguous_range op

* trt version > 7: support trt_convert_flatten_contiguous_range

* test case: skip when trt version < 7

---
 .../ir_passes/tensorrt_subgraph_pass.cc       |   7 +-
 .../fluid/inference/api/analysis_predictor.cc |   1 +
 .../inference/tensorrt/convert/CMakeLists.txt |   2 +-
 .../convert/flatten_contiguous_range_op.cc    | 136 ++++++++++++++++++
 paddle/fluid/inference/tensorrt/op_teller.cc  |  32 +++++
 ...st_trt_convert_flatten_contiguous_range.py | 115 +++++++++++++++
 6 files changed, 290 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py

diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index ef50df3084f8c..55bbc55450876 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -46,8 +46,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl(
               << " is disabled by config in TensorRT";
       return false;
     }
-    return tensorrt::OpTeller::Global().Tell(node, no_calib_int8,
-                                             with_dynamic_shape);
+    bool is_ok = tensorrt::OpTeller::Global().Tell(node, no_calib_int8,
+                                                   with_dynamic_shape);
+    if (!is_ok)
+      VLOG(3) << node->Op()->Type().c_str() << " op is not in TensorRT";
+    return is_ok;
   };
 
   framework::ir::SubGraphFuser fuser(
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 2799fb9e174d3..d4b680288e347 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1416,6 +1416,7 @@ USE_TRT_CONVERTER(elementwise_min_tensor);
 USE_TRT_CONVERTER(elementwise_pow_tensor);
 USE_TRT_CONVERTER(transpose);
 USE_TRT_CONVERTER(flatten);
+USE_TRT_CONVERTER(flatten_contiguous_range);
 USE_TRT_CONVERTER(matmul);
 USE_TRT_CONVERTER(conv2d);
 USE_TRT_CONVERTER(relu);
diff --git
a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index a885b69fa7fbc..017caca6adc81 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,7 +3,7 @@ nv_library(tensorrt_converter SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc + shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc anchor_generator_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc new file mode 100644 index 0000000000000..706814340a0e9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/flatten_contiguous_range_op.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * flatten_contiguous_range trt converter + */ +class FlattenContiguousRangeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + int dims = input->getDimensions().nbDims; + int start_axis = BOOST_GET_CONST(int, op_desc.GetAttr("start_axis")); + int stop_axis = BOOST_GET_CONST(int, op_desc.GetAttr("stop_axis")); + + nvinfer1::IShuffleLayer* layer = nullptr; + if (!engine_->with_dynamic_shape()) { + if (start_axis < 0) start_axis += dims + 1; + if (stop_axis < 0) stop_axis += dims + 1; + int dim_prod = 1; + nvinfer1::Dims flatten_dim; + flatten_dim.nbDims = dims - (stop_axis - start_axis); + for (int i = 0, j = 0; i < dims; ++i) { + if (start_axis <= i + 1 && i + 1 <= stop_axis) { + int dim_i = input->getDimensions().d[i]; + PADDLE_ENFORCE_GT(dim_i, 0, platform::errors::InvalidArgument( + "flatten_contiguous_range input dim " + "should be > 0, but got %d.", + dim_i)); + dim_prod *= dim_i; + if (i + 1 == stop_axis) { + flatten_dim.d[j++] = dim_prod; + } + } else { + flatten_dim.d[j++] = input->getDimensions().d[i]; + } + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setReshapeDimensions(flatten_dim); + } else { + if (start_axis < 0) start_axis += dims; + if (stop_axis < 0) stop_axis += dims; + auto* shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + auto* shape_layer_itensor = shape_layer->getOutput(0); + + nvinfer1::Dims start_dim, size_dim, stride_dim; + start_dim.nbDims = 1; + size_dim.nbDims = 1; + stride_dim.nbDims = 1; + start_dim.d[0] = start_axis; + size_dim.d[0] = stop_axis - start_axis + 1; + stride_dim.d[0] = 1; + auto* slice_layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *shape_layer_itensor, start_dim, + size_dim, stride_dim); + uint32_t reduce_dim = 1; + auto* reduce_prod_layer = TRT_ENGINE_ADD_LAYER( + engine_, Reduce, *(slice_layer->getOutput(0)), + nvinfer1::ReduceOperation::kPROD, reduce_dim, true); + + nvinfer1::ITensor* input_shape = nullptr; + if (start_axis == 0 && stop_axis == dims - 1) { + input_shape = reduce_prod_layer->getOutput(0); + } else { + std::vector itensors; + if (start_axis > 0) { + nvinfer1::Dims left_start_dim, left_size_dim, left_stride_dim; + left_start_dim.nbDims = 1; + left_size_dim.nbDims = 1; + left_stride_dim.nbDims = 1; + left_start_dim.d[0] = 0; + left_size_dim.d[0] = start_axis; + left_stride_dim.d[0] = 1; + auto* slice_layer_left = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, left_start_dim, + left_size_dim, left_stride_dim); + itensors.push_back(slice_layer_left->getOutput(0)); + } + itensors.push_back(reduce_prod_layer->getOutput(0)); + if (stop_axis < dims - 1) { + nvinfer1::Dims right_start_dim, right_size_dim, right_stride_dim; + right_start_dim.nbDims = 1; + right_size_dim.nbDims = 1; + right_stride_dim.nbDims = 1; + right_start_dim.d[0] = stop_axis + 1; + right_size_dim.d[0] = dims - stop_axis - 1; + right_stride_dim.d[0] = 1; + auto* slice_layer_right = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *shape_layer_itensor, right_start_dim, + right_size_dim, 
right_stride_dim);
+          itensors.push_back(slice_layer_right->getOutput(0));
+        }
+        auto* concat_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, Concatenation, itensors.data(), itensors.size());
+        concat_layer->setAxis(0);
+        input_shape = concat_layer->getOutput(0);
+      }
+      layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+      layer->setInput(1, *input_shape);
+    }
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(layer, "flatten_contiguous_range", {output_name},
+                             test_mode);
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(flatten_contiguous_range,
+                          FlattenContiguousRangeOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index ddee4e0d682b0..6663103d4ca37 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -55,6 +55,7 @@ struct SimpleOpTypeSetTeller : public Teller {
 // #endif
 #if IS_TRT_VERSION_GE(7000)
     teller_set.insert("tile");
+    teller_set.insert("flatten_contiguous_range");
 #endif
 #if CUDA_VERSION >= 10020
     teller_set.insert("reshape");
@@ -531,6 +532,37 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
         if (axis != 1) return false;
       }
     }
+    if (op_type == "flatten_contiguous_range") {
+      if (!with_dynamic_shape) {
+        int start_axis = BOOST_GET_CONST(int, desc.GetAttr("start_axis"));
+        int stop_axis = BOOST_GET_CONST(int, desc.GetAttr("stop_axis"));
+        auto x_var_name = desc.Input("X")[0];
+        auto* block = desc.Block();
+        if (block == nullptr) {
+          VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
+                     "Developers need to check whether block_desc is passed in "
+                     "the pass.";
+          return false;
+        }
+        auto* x_var_desc = block->FindVar(x_var_name);
+        const auto x_shape = x_var_desc->GetShape();
+        int dims = x_shape.size();
+        if (start_axis < 0) start_axis += dims;
+        if (start_axis == 0) {
+          VLOG(3) << "TRT flatten_contiguous_range not support the "
+                     "batch-dimension being changed";
+          return false;
+        }
+        if (stop_axis < 0) stop_axis += dims;
+        for (int i = start_axis; i <= stop_axis; ++i) {
+          if (x_shape[i] < 0) {
+            VLOG(3) << "On TRT static shape,flatten_contiguous_range input dim "
+                       "should be > 0";
+            return false;
+          }
+        }
+      }
+    }
 
     if (op_type == "gather") {
       auto gather_inputs = desc.Inputs();
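[Editorial gloss, hedged] Both branches of the converter above — the static one that multiplies known dims, and the dynamic one built from Shape/Slice/Reduce/Concat layers — implement the same shape rule: collapse dims [start_axis, stop_axis] into their product and leave the rest untouched. A standalone sketch of that rule (not TRT or Paddle code):

// Reference shape transform for flatten_contiguous_range.
#include <cassert>
#include <vector>

std::vector<int> FlattenContiguousRange(const std::vector<int>& shape,
                                        int start_axis, int stop_axis) {
  int dims = static_cast<int>(shape.size());
  if (start_axis < 0) start_axis += dims;  // same negative-axis rule as the op
  if (stop_axis < 0) stop_axis += dims;
  std::vector<int> out(shape.begin(), shape.begin() + start_axis);
  int prod = 1;
  for (int i = start_axis; i <= stop_axis; ++i) prod *= shape[i];
  out.push_back(prod);
  out.insert(out.end(), shape.begin() + stop_axis + 1, shape.end());
  return out;
}

int main() {
  // Mirrors the unit test below: input [2, batch, 4, 8, 3] with batch = 4.
  auto out = FlattenContiguousRange({2, 4, 4, 8, 3}, /*start=*/1, /*stop=*/3);
  assert((out == std::vector<int>{2, 128, 3}));  // 4 * 4 * 8 = 128
  return 0;
}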
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py
new file mode 100644
index 0000000000000..a4060349d4bed
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten_contiguous_range.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons
+from program_config import TensorConfig, ProgramConfig
+import unittest
+import numpy as np
+import paddle.inference as paddle_infer
+from functools import partial
+from typing import Optional, List, Callable, Dict, Any, Set
+
+
+class TrtConvertFlattenContiguousRangeTest(TrtLayerAutoScanTest):
+    def is_program_valid(self, program_config: ProgramConfig) -> bool:
+        return True
+
+    def sample_program_configs(self):
+        def generate_input(batch):
+            return np.random.random([2, batch, 4, 8, 3]).astype(np.float32)
+
+        for batch in [1, 2, 4]:
+            for start_axis in range(5):
+                for stop_axis in range(start_axis, 5):
+                    type = "flatten_contiguous_range"
+                    op_outputs = {
+                        "Out": ["output_data"],
+                        "XShape": ["xshape_data"]
+                    }
+                    ops_config = [{
+                        "op_type": type,
+                        "op_inputs": {
+                            "X": ["input_data"]
+                        },
+                        "op_outputs": op_outputs,
+                        "op_attrs": {
+                            "start_axis": start_axis,
+                            "stop_axis": stop_axis,
+                        }
+                    }]
+                    ops = self.generate_op_config(ops_config)
+
+                    program_config = ProgramConfig(
+                        ops=ops,
+                        weights={},
+                        inputs={
+                            "input_data": TensorConfig(
+                                data_gen=partial(generate_input, batch))
+                        },
+                        outputs=["output_data"])
+                    yield program_config
+
+    def sample_predictor_configs(
+            self, program_config) -> (paddle_infer.Config, List[int], float):
+        def generate_dynamic_shape(attrs):
+            self.dynamic_shape.min_input_shape = {"input_data": [2, 1, 4, 8, 3]}
+            self.dynamic_shape.max_input_shape = {"input_data": [2, 4, 4, 8, 3]}
+            self.dynamic_shape.opt_input_shape = {"input_data": [2, 2, 4, 8, 3]}
+
+        def clear_dynamic_shape():
+            self.dynamic_shape.max_input_shape = {}
+            self.dynamic_shape.min_input_shape = {}
+            self.dynamic_shape.opt_input_shape = {}
+
+        def generate_trt_nodes_num(attrs, dynamic_shape):
+            ver = paddle_infer.get_trt_compile_version()
+            if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 >= 7000:
+                if dynamic_shape:
+                    return 1, 2
+                else:
+                    if attrs[0]['start_axis'] == 0:
+                        return 0, 3
+                    else:
+                        return 1, 2
+            else:
+                return 0, 3
+
+        attrs = [
+            program_config.ops[i].attrs
+            for i in range(len(program_config.ops))
+        ]
+
+        # for static_shape
+        clear_dynamic_shape()
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(
+            attrs, False), 1e-5
+
+        # for dynamic_shape
+        generate_dynamic_shape(attrs)
+        self.trt_param.precision = paddle_infer.PrecisionType.Float32
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
+                                                                     True), 1e-5
+        self.trt_param.precision = paddle_infer.PrecisionType.Half
+        yield self.create_inference_config(), generate_trt_nodes_num(attrs,
+                                                                     True), 1e-5
+
+    def test(self):
+        self.run_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
From a88791481484ab6a61540a737336d79c65d021dc Mon Sep 17 00:00:00 2001
From: zyfncg
Date: Sat, 15 Jan 2022 12:39:49 +0800
Subject: [PATCH 05/10] fix performance problem caused by Conj (#38939)

---
 paddle/pten/kernels/complex_kernel.h | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h
index b6074f117ea14..d12fc730fef87 100644
--- a/paddle/pten/kernels/complex_kernel.h
+++ b/paddle/pten/kernels/complex_kernel.h
@@ -14,6 +14,7 @@ limitations under the License.
 */
 
 #pragma once
 
+#include "paddle/fluid/platform/complex.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/empty_kernel.h"
@@ -23,7 +24,13 @@ namespace pten {
 
 template <typename T, typename Context>
 void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
-template <typename T, typename Context>
+// If T is complex
+template <typename T, typename Context,
+          std::enable_if_t<std::is_same<T, paddle::platform::complex<float>>::value ||
+                               std::is_same<T, paddle::platform::complex<double>>::value,
+                           bool> = true>
 DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   auto out_meta = UnchangedInferMeta(x.meta());
   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
@@ -31,4 +38,15 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
   return dense_out;
 }
 
+// If T is not complex
+template <typename T, typename Context,
+          std::enable_if_t<!std::is_same<T, paddle::platform::complex<float>>::value &&
+                               !std::is_same<T, paddle::platform::complex<double>>::value,
+                           bool> = true>
+DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) {
+  return x;
+}
+
 }  // namespace pten
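[Editorial gloss, hedged] The performance fix above works by compile-time dispatch: for non-complex element types, Conj degenerates to returning its input unchanged, so no conjugation kernel is launched and no output tensor is allocated. A minimal standalone illustration of the same enable_if pattern (toy scalar types, not the pten API):

#include <complex>
#include <iostream>
#include <type_traits>

// Complex overload: does real work.
template <typename T,
          std::enable_if_t<std::is_same<T, std::complex<float>>::value ||
                               std::is_same<T, std::complex<double>>::value,
                           bool> = true>
T Conj(const T& v) {
  return std::conj(v);
}

// Everything else: conjugation is the identity, so hand the input back as-is —
// this is exactly how the patch avoids a needless kernel launch and copy.
template <typename T,
          std::enable_if_t<!std::is_same<T, std::complex<float>>::value &&
                               !std::is_same<T, std::complex<double>>::value,
                           bool> = true>
T Conj(const T& v) {
  return v;
}

int main() {
  std::cout << Conj(std::complex<double>(1, 2)) << "\n";  // (1,-2)
  std::cout << Conj(3.5) << "\n";                         // 3.5, untouched
  return 0;
}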
From 88966b283952096f81aab4918b7d83b303aabad2 Mon Sep 17 00:00:00 2001
From: Zhanlue Yang
Date: Sat, 15 Jan 2022 14:35:33 +0800
Subject: [PATCH 06/10] [Unify Tensors PR #7] Merged LoDTensor with Tensor,
 test=allcases (#38880)

* Merged LoDTensor with Tensor, test=allcases

* Patched python level LoDTensor

* Fixed example code failure

* Polished function names, removed duplicated forward declarations

---
 paddle/fluid/distributed/fleet.h              |   2 +-
 .../fluid/distributed/service/brpc_utils.cc   |   2 +-
 .../test/brpc_service_dense_sgd_test.cc       |   2 +-
 .../test/brpc_service_sparse_sgd_test.cc      |   2 +-
 paddle/fluid/framework/data_feed.h            |   2 +-
 .../framework/details/fetch_async_op_handle.h |   2 +-
 .../framework/details/variable_visitor.cc     |   2 +-
 paddle/fluid/framework/device_worker.cc       |   2 +-
 paddle/fluid/framework/device_worker.h        |   3 +-
 paddle/fluid/framework/downpour_worker.cc     |   2 +-
 paddle/fluid/framework/feed_fetch_method.cc   |   2 +-
 paddle/fluid/framework/feed_fetch_method.h    |   2 +-
 .../ir/conv_affine_channel_fuse_pass.cc       |   2 +-
 .../fluid/framework/ir/conv_bn_fuse_pass.cc   |   2 +-
 .../framework/ir/delete_dropout_op_pass.cc    |   2 +-
 .../ir/delete_quant_dequant_op_pass.cc        |   2 +-
 .../ir/fusion_group/code_generator_tester.cc  |   2 +-
 paddle/fluid/framework/lod_tensor.cc          |  28 --
 paddle/fluid/framework/lod_tensor.h           |  24 +-
 paddle/fluid/framework/naive_executor.h       |   2 +-
 paddle/fluid/framework/operator.cc            |  12 +-
 paddle/fluid/framework/operator.h             |   6 -
 paddle/fluid/framework/pull_dense_worker.cc   |   2 +-
 paddle/fluid/framework/tensor.h               |   9 +-
 paddle/fluid/framework/tensor_util.cc         |  24 ++
 paddle/fluid/framework/tensor_util.h          |   4 +-
 paddle/fluid/framework/trainer.h              |   2 +-
 paddle/fluid/framework/var_type_traits.h      |   7 +-
 paddle/fluid/inference/api/api_impl.h         |   2 +-
 .../api/details/reset_tensor_array.h          |   2 +-
 paddle/fluid/operators/assert_op.cc           |   2 +-
 paddle/fluid/operators/assign_op.h            |   2 +-
 .../operators/controlflow/while_op_helper.h   |   2 +-
 paddle/fluid/operators/math/beam_search.cc    |   1 -
 .../fluid/operators/math/beam_search_npu.cc   |   1 -
 .../fluid/operators/math/sequence_padding.cc  |   1 -
 paddle/fluid/operators/math/sequence_scale.cc |   2 +-
 paddle/fluid/operators/math/sequence_scale.h  |   2 +-
 paddle/fluid/operators/memcpy_d2h_op.h        |   2 +-
 paddle/fluid/operators/memcpy_h2d_op.h        |   2 +-
 paddle/fluid/operators/memcpy_op.h            |   2 +-
 paddle/fluid/operators/merge_lod_tensor_op.cc |   2 +-
 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc |   1 -
 paddle/fluid/operators/print_op.cc            |   2 +-
 paddle/fluid/operators/recurrent_op.cc        |   2 +-
 .../reorder_lod_tensor_by_rank_op.cc          |   2 +-
 paddle/fluid/operators/split_lod_tensor_op.cc |   2 +-
 paddle/fluid/operators/tensor_formatter.h     |   2 +-
 paddle/fluid/operators/transfer_layout_op.h   |   2 +-
 paddle/fluid/platform/lodtensor_printer.cc    |   2 +-
 paddle/fluid/pybind/pybind.cc                 | 277 ++++++------
 paddle/pten/api/lib/utils/tensor_utils.cc     |  51 ++--
 paddle/pten/api/lib/utils/tensor_utils.h      |  18 +-
 python/paddle/fluid/__init__.py               |   5 +
 54 files changed, 203 insertions(+), 343 deletions(-)

diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h
index 6d9ce01535e9d..697dbb9170f18 100644
--- a/paddle/fluid/distributed/fleet.h
+++ b/paddle/fluid/distributed/fleet.h
@@ -36,7 +36,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class LoDTensor;
+class Tensor;
 class Scope;
 class SelectedRows;
 class Variable;
diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc
index 6eb8462977b60..db55c9ad438a7 100644
--- a/paddle/fluid/distributed/service/brpc_utils.cc
+++ b/paddle/fluid/distributed/service/brpc_utils.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 class Variable;
-class LoDTensor;
+class Tensor;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index 68d1d457500c7..c0c1fda4c4fca 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -31,7 +31,7 @@ class PSClient;
 class PSServer;
 }  // namespace distributed
 namespace framework {
-class LoDTensor;
+class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index 8fb3434af6e28..471750feaefef 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -32,7 +32,7 @@ class PSClient;
 class PSServer;
 }  // namespace distributed
 namespace framework {
-class LoDTensor;
+class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index a4100e66e7285..2533acaa6d35a 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -50,7 +50,7 @@ DECLARE_bool(enable_slotrecord_reset_shrink);
 namespace paddle {
 namespace framework {
 class DataFeedDesc;
-class LoDTensor;
+class Tensor;
 class Scope;
 class Variable;
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.h b/paddle/fluid/framework/details/fetch_async_op_handle.h
index f863cc304b8a5..41df0d90aaf81 100644
--- a/paddle/fluid/framework/details/fetch_async_op_handle.h
+++ b/paddle/fluid/framework/details/fetch_async_op_handle.h
@@ -24,7 +24,7 @@
 namespace paddle {
 namespace framework {
 
-class LoDTensor;
+class Tensor;
 
 namespace ir {
 class Node;
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
index 71e5dd28eded1..56c88e9d25a91 100644
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -18,7 +18,7 @@
 namespace paddle {
 namespace framework {
-class LoDTensor;
+class Tensor;
 class Variable;
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc
index
fbaae5a21c274..3b70ef737f5be 100644 --- a/paddle/fluid/framework/device_worker.cc +++ b/paddle/fluid/framework/device_worker.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 15acedf3cf50a..332a584049127 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -43,10 +43,9 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class ProgramDesc; class Scope; -class Tensor; } // namespace framework namespace platform { class DeviceContext; diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 11f70acb73aa7..cc97af4b1969d 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 2eac65c90c02f..0c3aafd85f283 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; void SetFeedVariable(Scope* scope, const LoDTensor& input, diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 4c2f5b9796a22..dc9310ff5b263 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; void SetFeedVariable(Scope* scope, const LoDTensor& input, diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index 6cd16132c2a10..c883412a9a4c3 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index b6c410dc957fd..6443d0594a9c5 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -21,7 +21,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc index 09962239a01b1..c0a4f099e39d4 100644 --- a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -17,7 +17,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc index b99f2266f39b2..af75646551e28 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 0d490d4e669fc..09fd6b8dd1116 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -24,7 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 69a2a6eefaf8c..4681933a66cd3 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -27,34 +27,6 @@ class DeviceContext; namespace paddle { namespace framework { -std::ostream &operator<<(std::ostream &os, const LoD &lod) { - os << "{"; - for (auto &v : lod) { - os << "{"; - bool is_first = true; - for (auto &i : v) { - if (is_first) { - os << i; - is_first = false; - } else { - os << ", " << i; - } - } - os << "}"; - } - os << "}"; - - return os; -} - -std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { - if (t.lod().size() > 0) { - os << " - lod: " << t.lod() << "\n"; - } - os << static_cast(t); - return os; -} - std::string LoDToString(const LoD &lod) { std::ostringstream stream; stream << lod; diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 22f2027998137..bbb8f8005168c 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -28,9 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" namespace paddle { -namespace framework { -class LoDTensor; -} // namespace framework namespace platform { class DeviceContext; } // namespace platform @@ -39,6 +36,8 @@ class DeviceContext; namespace paddle { namespace framework { +using LoDTensor = paddle::framework::Tensor; + /* * LoD is short for Level of Details. * @@ -56,9 +55,6 @@ namespace framework { */ using LoD = std::vector>; -std::ostream& operator<<(std::ostream& os, const LoD& lod); -std::ostream& operator<<(std::ostream& os, const LoDTensor& t); - std::string LoDToString(const LoD& lod); LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, @@ -102,22 +98,6 @@ bool CheckLoD(const LoD& in, int tensor_height = -1); */ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); -/* - * LoDTensor (Level of details Tensor) - * see https://en.wikipedia.org/wiki/Level_of_details for reference. - */ -class LoDTensor : public Tensor { - public: - using Tensor::Tensor; - - // Split LoDTensor and copy to each place specified in places. - std::vector SplitLoDTensor( - const std::vector places) const; - - void MergeLoDTensor(const std::vector& lod_tensors, - platform::Place place); -}; - /* * Expand the `source` to fit the LoD of `lod`. For example, a `source` * LoDTensor is diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index ed475e66f626d..f706eabb47988 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -31,7 +31,7 @@ namespace framework { * Simple, intuitive and effective. Only single thread is supported, and * currently designed for inference. */ -class LoDTensor; +class Tensor; class ProgramDesc; class Scope; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 93349b8b88449..aa21c8eed256b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -34,7 +34,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle #ifdef PADDLE_WITH_XPU @@ -555,11 +555,6 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { return it->second.empty() ? 
nullptr : it->second[0];
 }
 
-template <>
-const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
-  return Input<LoDTensor>(name);
-}
-
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const {
@@ -584,11 +579,6 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput(
   return res;
 }
 
-template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
-  return Output<LoDTensor>(name);
-}
-
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 9d75c66beb7d4..12946b416cf9f 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -479,16 +479,10 @@ class ExecutionArgumentMappingContext : public pten::ArgumentMappingContext {
   const ExecutionContext& ctx_;
 };
 
-template <>
-const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
-
 template <>
 const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const;
 
-template <>
-Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
-
 template <>
 std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const;
diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc
index 77d8abcd26e9e..b13aaadc81661 100644
--- a/paddle/fluid/framework/pull_dense_worker.cc
+++ b/paddle/fluid/framework/pull_dense_worker.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class LoDTensor;
+class Tensor;
 class Scope;
 class Variable;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index fcdb837bc80ce..95405820a48d9 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -36,7 +36,7 @@ namespace paddle {
 
 namespace framework {
 
-class LoDTensor;
+using LoD = std::vector<std::vector<size_t>>;
 
 /* NOTE(liym27): [ What is TensorInplaceVersion used for? ]
@@ -74,6 +74,13 @@ class Tensor : public pten::DenseTensor {
   using DenseTensor = pten::DenseTensor;
   using DenseTensor::DenseTensor;
 
+  // Split Tensor and copy to each place specified in places.
+  std::vector<Tensor> SplitLoDTensor(
+      const std::vector<platform::Place> places) const;
+
+  void MergeLoDTensor(const std::vector<const Tensor*>& lod_tensors,
+                      platform::Place place);
+
   /*! The internal of two tensors share the same memory block. */
   Tensor& ShareDataWith(const Tensor& src);
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 724e3cc1e2ee8..84334417dc7da 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -1428,7 +1428,31 @@ std::ostream& print_tensor<paddle::platform::complex<double>>(
   return os;
 }
 
+std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+  os << "{";
+  for (auto& v : lod) {
+    os << "{";
+    bool is_first = true;
+    for (auto& i : v) {
+      if (is_first) {
+        os << i;
+        is_first = false;
+      } else {
+        os << ", " << i;
+      }
+    }
+    os << "}";
+  }
+  os << "}";
+
+  return os;
+}
+
 std::ostream& operator<<(std::ostream& os, const Tensor& t) {
+  if (t.lod().size() > 0) {
+    os << " - lod: " << t.lod() << "\n";
+  }
+
   os << " - place: " << t.place() << "\n";
   os << " - shape: [" << t.dims() << "]\n";
   os << " - layout: " << DataLayoutToString(t.layout()) << "\n";
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 11858e4166595..355be39baa2a5 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -39,6 +39,9 @@ limitations under the License.
*/ namespace paddle { namespace framework { +std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const Tensor& t); + class PrintOptions { public: static PrintOptions& Instance() { @@ -494,6 +497,5 @@ inline void TensorToVector(const Tensor& src, std::vector* dst) { delete[] array; } -std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 4823c08305760..8bba9492a5686 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -40,7 +40,7 @@ namespace paddle { namespace framework { class Dataset; -class LoDTensor; +class Tensor; class ProgramDesc; class PullDenseWorker; class Scope; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index f4c41197a9dfa..715e7a14c5529 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -70,11 +70,10 @@ class BKCLCommunicator; namespace framework { class LoDRankTable; class ScopeBase; -class LoDTensor; +class Tensor; class ReaderHolder; class Scope; class SelectedRows; -class Tensor; } // namespace framework namespace operators { @@ -164,8 +163,8 @@ struct VarTypeRegistryImpl { // Users should add other variable types below. // Paddle would generate unique Ids for each registered variable types. using VarTypeRegistry = detail::VarTypeRegistryImpl< - Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, - Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, + Tensor, SelectedRows, std::vector, LoDRankTable, Strings, + LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index be771ac48fc15..bf67cfed35f89 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -35,7 +35,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; } // namespace framework diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index f12a54cdccedc..857160ad10282 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -23,7 +23,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Scope; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/assert_op.cc b/paddle/fluid/operators/assert_op.cc index 3e4250389fcfc..466e0e793e4e3 100644 --- a/paddle/fluid/operators/assert_op.cc +++ b/paddle/fluid/operators/assert_op.cc @@ -19,7 +19,7 @@ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; class Variable; diff --git a/paddle/fluid/operators/assign_op.h b/paddle/fluid/operators/assign_op.h index bd314a00424bd..d9648c9617255 100644 --- a/paddle/fluid/operators/assign_op.h +++ b/paddle/fluid/operators/assign_op.h @@ -27,7 +27,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 1685da4e95822..8ef12ca05e36a 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -24,7 +24,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class ProgramDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index 5271da91b8c15..c52ba68331580 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/math/beam_search_npu.cc b/paddle/fluid/operators/math/beam_search_npu.cc index 6afaaea0673b2..5aede02263dd5 100644 --- a/paddle/fluid/operators/math/beam_search_npu.cc +++ b/paddle/fluid/operators/math/beam_search_npu.cc @@ -17,7 +17,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index e29313e9f742c..491d40d3ae567 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -16,7 +16,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index 8e58411a1f247..f4193bb71fabb 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/math/sequence_scale.h b/paddle/fluid/operators/math/sequence_scale.h index d84513e024d7f..c6c84bb55dfa7 100644 --- a/paddle/fluid/operators/math/sequence_scale.h +++ b/paddle/fluid/operators/math/sequence_scale.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/memcpy_d2h_op.h b/paddle/fluid/operators/memcpy_d2h_op.h index efa8af8054fc8..94eed5cf83fee 100644 --- a/paddle/fluid/operators/memcpy_d2h_op.h +++ b/paddle/fluid/operators/memcpy_d2h_op.h @@ -24,7 +24,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index a19dc3367a14b..cc6e771d105ae 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -25,7 +25,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index 57dafab1d5bc7..b270d87ad00ea 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -27,7 +27,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; class SelectedRows; } // namespace framework diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 5024148fe5888..dae598ef64220 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 30e788bb395a4..754b46c823b28 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -19,7 +19,6 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; class Tensor; } // namespace framework namespace platform { diff --git a/paddle/fluid/operators/print_op.cc b/paddle/fluid/operators/print_op.cc index c558f1852f54c..cef2993fc30d5 100644 --- a/paddle/fluid/operators/print_op.cc +++ b/paddle/fluid/operators/print_op.cc @@ -19,7 +19,7 @@ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 556f1bccd1680..7adf7962e1987 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc index d8d4e641aeb3e..4ba071032162a 100644 --- a/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc +++ b/paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace framework { class LoDRankTable; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index fe646b2830b66..0ff622d329919 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace framework { class InferShapeContext; -class LoDTensor; +class Tensor; class OpDesc; class Scope; } // namespace framework diff --git a/paddle/fluid/operators/tensor_formatter.h b/paddle/fluid/operators/tensor_formatter.h index 4608663b3ed9b..38e3e7a94a524 100644 --- a/paddle/fluid/operators/tensor_formatter.h +++ b/paddle/fluid/operators/tensor_formatter.h @@ -20,7 +20,7 @@ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/transfer_layout_op.h b/paddle/fluid/operators/transfer_layout_op.h index 1f09aec05b936..28135e37ed7bb 100644 --- a/paddle/fluid/operators/transfer_layout_op.h +++ b/paddle/fluid/operators/transfer_layout_op.h @@ -29,7 +29,7 @@ class DeviceContext; namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index d607dbe5b9999..4a5dfbee15de2 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -18,7 +18,7 @@ limitations under the License. */ namespace paddle { namespace framework { -class LoDTensor; +class Tensor; class Variable; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b5845a1ef9628..5f4e9a8861390 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -875,12 +875,12 @@ PYBIND11_MODULE(core_noavx, m) { .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( - Set the data of LoDTensor on place with given numpy array. + Set the data of Tensor on place with given numpy array. Args: lod (numpy.ndarray): The data to set. place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace|MLUPlace): The place where the - LoDTensor is to be set. + Tensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -893,17 +893,17 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) )DOC") .def("shape", [](framework::Tensor &self) { return vectorize(self.dims()); }, R"DOC( - Return the shape of LoDTensor. + Return the shape of Tensor. Returns: - list[int]: The shape of LoDTensor. 
+ list[int]: The shape of Tensor. Examples: @@ -912,7 +912,7 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) print(t.shape()) # [5, 30] )DOC") @@ -949,117 +949,34 @@ PYBIND11_MODULE(core_noavx, m) { }) .def("_share_data_with", &framework::Tensor::ShareDataWith) .def("__getitem__", PySliceTensor, py::return_value_policy::reference) - .def("__str__", [](const framework::Tensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }); - - // TODO(cql): add reference: en_user_guide_lod_tensor - py::class_(m, "LoDTensor", R"DOC( - LoDTensor is a Tensor with optional LoD (Level of Details) information, - it can be used for variable-length sequences, - see :ref:`user_guide_lod_tensor` for details. - - LoDTensor can be converted to numpy array using :code:`numpy.array(lod_tensor)`. - - You can skip the following explanation if you don't need to know details - of LoDTensor. - - The following two examples show how to use LODtensor to represent - variable-length sequences. - - Example 1: - - Suppose x is a LoDTensor representing a variable-length sequence. - It contains two logical subsequences, the length of first logical sequence - is 2 (e.g., number of samples is 2), the length of second logical sequence - is 3, and the total length is 5. The data of the first logical sequence is - [1, 2], [3, 4], and the data of the second logical sequence is [5, 6], - [7, 8], [9, 10]. The data dimension of each sample is 2. So, the final - shape of the LoDTensor is [5, 2], of which 5 is the total length and 2 is - the dimension of each sample. - - Logically, we can represent the variable-length sequence in two ways: one - is in the form of recursive sequence lengths, that is, - x.recursive_sequence_lengths=[[2, 3]]; the other is in the form of offsets, - that is, x.lod=[[0, 2, 2+3]]. These two representations are equivalent, and - you can set and retrieve recursive_sequence_lengths or LoD through the - corresponding interfaces of LoDTensor introduced later. - - Actually, in order to access sequence faster, Paddle uses offset to store - different lengths of sequences. - Therefore, the operations on recursive_sequence_lengths will be converted - to the operations on LoD eventually. - - .. code-block:: python - - y.data = [[1, 2], [3, 4], - [5, 6], [7, 8], - [9, 10], [11, 12], [13, 14]] - - y.shape = [2+2+3, 2] - - y.recursive_sequence_lengths = [[2, 1], [2, 2, 3]] - - y.lod = [[0, 2, 3], [0, 2, 4, 7]] - - Example 2: - - LoD may have more than one level (for example, a paragraph may have more - than one sentence and a sentence may have more than one word). Suppose y - is a LoDTensor and its lod_level is 2. - From level = 0, there are two logical sequences, the length of which is - 2 and 1, respectively, indicating that the first logical sequence contains - two sub-sequences and the second logical sequence contains one sub-sequence. - From level = 1, the lengths of two sub-sequences contained by the first - logical sequence is 2 and 2, and the length of sub-sequence contained by - the second logical sequence is 3. - - Therefore, the LoDTensor is represented in the form of recursive sequence - lengths as y.recursive_sequence_lengths=[[2,1], [2,2,3]]; and equally, in - the form of offset, it is represented as y.lod=[[0,2,3], [0,2,4,7]]. - - .. 
code-block:: python - - y.data = [[1, 2], [3, 4], - [5, 6], [7, 8], - [9, 10], [11, 12], [13, 14]] - - y.shape = [2+2+3, 2] - - y.recursive_sequence_lengths = [[2, 1], [2, 2, 3]] - - y.lod = [[0, 2, 3], [0, 2, 4, 7]] - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - - t = fluid.LoDTensor() - - )DOC") - .def("__array__", - [](framework::Tensor &self) { return TensorToPyArray(self); }) + .def("__str__", + [](const framework::Tensor &self) { + std::stringstream ostr; + ostr << self; + return ostr.str(); + }) /* ------ End of original Tensor ------ */ + .def( + "__init__", + [](framework::Tensor &instance, const std::vector> + &recursive_sequence_lengths) { + LoD new_lod; + new_lod.reserve(recursive_sequence_lengths.size()); + std::copy(recursive_sequence_lengths.begin(), + recursive_sequence_lengths.end(), + std::back_inserter(new_lod)); + LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), true, + platform::errors::InvalidArgument( + "The provided recursive_sequence_lengths info is invalid, " + "the LoD converted by recursive_sequence_lengths is %s", + new_lod)); + new (&instance) framework::Tensor(new_offset_lod); + }) .def("__init__", - [](LoDTensor &instance, const std::vector> - &recursive_sequence_lengths) { - LoD new_lod; - new_lod.reserve(recursive_sequence_lengths.size()); - std::copy(recursive_sequence_lengths.begin(), - recursive_sequence_lengths.end(), - std::back_inserter(new_lod)); - LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE_EQ( - CheckLoD(new_offset_lod, -1), true, - platform::errors::InvalidArgument( - "The provided recursive_sequence_lengths info is invalid, " - "the LoD converted by recursive_sequence_lengths is %s", - new_lod)); - new (&instance) LoDTensor(new_offset_lod); + [](framework::Tensor &instance) { + new (&instance) framework::Tensor(); }) - .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) // We implement offset based LOD in C++ while we use length based with // Python API. So we changed set_lod to set_recursive_sequence_lengths // to @@ -1067,7 +984,8 @@ PYBIND11_MODULE(core_noavx, m) { // The discussion is here: // https://github.com/PaddlePaddle/Paddle/issues/10855 .def("set_lod", - [](LoDTensor &self, const std::vector> &lod) { + [](framework::Tensor &self, + const std::vector> &lod) { // the input lod is offset-based level-of-detail info LoD new_lod; new_lod.reserve(lod.size()); @@ -1079,7 +997,7 @@ PYBIND11_MODULE(core_noavx, m) { self.set_lod(new_lod); }, py::arg("lod"), R"DOC( - Set LoD of the LoDTensor. + Set LoD of the Tensor. Args: lod (list[list[int]]): The lod to set. @@ -1093,14 +1011,14 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_lod([[0, 2, 5]]) print(t.lod()) # [[0, 2, 5]] )DOC") .def("set_recursive_sequence_lengths", - [](LoDTensor &self, const std::vector> - &recursive_sequence_lengths) { + [](framework::Tensor &self, const std::vector> + &recursive_sequence_lengths) { // the input recursive_sequence_lengths is length-based // level-of-detail info LoD new_lod; @@ -1119,7 +1037,7 @@ PYBIND11_MODULE(core_noavx, m) { self.set_lod(new_offset_lod); }, py::arg("recursive_sequence_lengths"), R"DOC( - Set LoD of the LoDTensor according to recursive sequence lengths. + Set LoD of the Tensor according to recursive sequence lengths. 
For example, if recursive_sequence_lengths=[[2, 3]], which means there are two sequences with length 2 and 3 respectively, the @@ -1137,14 +1055,14 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_recursive_sequence_lengths([[2, 3]]) - print(t.recursive_sequence_length()) # [[2, 3]] + print(t.recursive_sequence_lengths()) # [[2, 3]] print(t.lod()) # [[0, 2, 5]] )DOC") .def("lod", - [](LoDTensor &self) -> std::vector<std::vector<size_t>> { + [](framework::Tensor &self) -> std::vector<std::vector<size_t>> { // output the offset-based lod info LoD lod = self.lod(); std::vector<std::vector<size_t>> new_lod; @@ -1153,10 +1071,10 @@ PYBIND11_MODULE(core_noavx, m) { return new_lod; }, R"DOC( - Return the LoD of the LoDTensor. + Return the LoD of the Tensor. Returns: - list[list[int]]: The lod of the LoDTensor. + list[list[int]]: The lod of the Tensor. Examples: .. code-block:: python @@ -1164,14 +1082,14 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_lod([[0, 2, 5]]) print(t.lod()) # [[0, 2, 5]] )DOC") // Set above comments of set_lod. .def("recursive_sequence_lengths", - [](LoDTensor &self) -> std::vector<std::vector<size_t>> { + [](framework::Tensor &self) -> std::vector<std::vector<size_t>> { // output the length-based lod info LoD lod = ConvertToLengthBasedLoD(self.lod()); std::vector<std::vector<size_t>> new_lod; @@ -1181,7 +1099,7 @@ PYBIND11_MODULE(core_noavx, m) { }, R"DOC( Return the recursive sequence lengths corresponding to the LoD - of the LoDTensor. + of the Tensor. Returns: list[list[int]]: The recursive sequence lengths. @@ -1192,19 +1110,19 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_recursive_sequence_lengths([[2, 3]]) print(t.recursive_sequence_lengths()) # [[2, 3]] )DOC") .def("has_valid_recursive_sequence_lengths", - [](LoDTensor &self) -> bool { + [](framework::Tensor &self) -> bool { // Check that the lod info is valid and matches the outermost - // dimension of the LoDTensor data + // dimension of the Tensor data return CheckLoD(self.lod(), vectorize(self.dims()).front()); }, R"DOC( - Check whether the LoD of the LoDTensor is valid. + Check whether the LoD of the Tensor is valid. Returns: bool: Whether the LoD is valid. @@ -1215,91 +1133,80 @@ PYBIND11_MODULE(core_noavx, m) { import paddle.fluid as fluid import numpy as np - t = fluid.LoDTensor() + t = fluid.Tensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) t.set_recursive_sequence_lengths([[2, 3]]) print(t.has_valid_recursive_sequence_lengths()) # True )DOC") - .def("__getitem__", PySliceTensor, py::return_value_policy::reference, - R"DOC( - Slice the original Tensor, and remove the LoD information. - - Returns: - out (Tensor): new Tensor(NOT LoDTensor). 
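// NOTE(reviewer): a minimal sketch of the length-based -> offset-based LoD
// conversion that ConvertToOffsetBasedLoD performs for the bindings above,
// e.g. recursive_sequence_lengths [[2, 3]] -> lod [[0, 2, 5]]. The helper
// name is illustrative, not a Paddle API:
//
//   std::vector<std::vector<size_t>> ToOffsetLoD(
//       const std::vector<std::vector<size_t>>& lengths) {
//     std::vector<std::vector<size_t>> offsets;
//     for (const auto& level : lengths) {
//       std::vector<size_t> row = {0};
//       for (size_t len : level) row.push_back(row.back() + len);
//       offsets.push_back(std::move(row));
//     }
//     return offsets;  // ToOffsetLoD({{2, 3}}) == {{0, 2, 5}}
//   }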
- )DOC") - .def("__str__", - [](const LoDTensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); - }) .def("_as_type", - [](const LoDTensor &self, + [](const framework::Tensor &self, paddle::framework::proto::VarType::Type type) { - LoDTensor dst; + framework::Tensor dst; if (self.IsInitialized() && self.numel() > 0) { TransDataType(self, type, &dst); } return dst; }) - .def("_copy", [](const LoDTensor &self, const platform::Place &place) { - // follow fetch_op's inplementation - LoDTensor dst; - if (self.IsInitialized() && self.numel() > 0) { - TensorCopySync(self, place, &dst); - } else { - // Not copy, if the src tensor is empty. - dst.clear(); - dst.Resize({0}); - } - dst.set_lod(self.lod()); - return dst; + .def("_copy", + [](const framework::Tensor &self, const platform::Place &place) { + // follow fetch_op's inplementation + framework::Tensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TensorCopySync(self, place, &dst); + } else { + // Not copy, if the src tensor is empty. + dst.clear(); + dst.Resize({0}); + } + dst.set_lod(self.lod()); + return dst; #ifdef _WIN32 - }); + }); #else }) .def(py::pickle( - [](const LoDTensor &t) { // __getstate__ + [](const framework::Tensor &t) { // __getstate__ auto holder = t.Holder(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(holder->place()), true, - platform::errors::PreconditionNotMet( - "LoDTensor is not on CPU." - "Now only LoDTensor on CPU can be serialized.")); - auto* mmap_writer_allocation = - dynamic_cast( - holder.get()); - PADDLE_ENFORCE_NOT_NULL(mmap_writer_allocation, - platform::errors::PreconditionNotMet( - "LoDTensor is not in shared memory." - "Now only LoDTensor on shared memory can be serialized.")); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(holder->place()), true, + platform::errors::PreconditionNotMet( + "Tensor is not on CPU." + "Now only Tensor on CPU can be serialized.")); + auto *mmap_writer_allocation = + dynamic_cast( + holder.get()); + PADDLE_ENFORCE_NOT_NULL( + mmap_writer_allocation, + platform::errors::PreconditionNotMet( + "Tensor is not in shared memory." + "Now only Tensor on shared memory can be serialized.")); int type_idx = static_cast(t.type()); return py::make_tuple(mmap_writer_allocation->ipc_name(), - mmap_writer_allocation->size(), - type_idx, vectorize(t.dims()), t.lod()); + mmap_writer_allocation->size(), type_idx, + vectorize(t.dims()), t.lod()); }, [](py::tuple t) { // __setstate__ if (t.size() != 5) - throw std::runtime_error("Invalid LoDTensor state!"); + throw std::runtime_error("Invalid Tensor state!"); // 1. Create a new C++ instance - LoDTensor tensor; + framework::Tensor tensor; // 2. Rebuild Allocation const std::string &ipc_name = t[0].cast(); size_t size = t[1].cast(); auto shared_reader_holder = - memory::allocation::RebuildMemoryMapReaderAllocation( - ipc_name, size); + memory::allocation::RebuildMemoryMapReaderAllocation(ipc_name, + size); // 3. Maintain global fd set - VLOG(3) << "LoDTensor ipc name: " << ipc_name; + VLOG(3) << "Tensor ipc name: " << ipc_name; memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name); - // 4. Rebuild LoDTensor - tensor.ResetHolderWithType(shared_reader_holder, - static_cast(t[2].cast())); + // 4. 
Rebuild Tensor + tensor.ResetHolderWithType( + shared_reader_holder, + static_cast(t[2].cast())); tensor.Resize(make_ddim(t[3].cast>())); tensor.set_lod(t[4].cast()); diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 53d641896e43f..edd5cde938630 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -31,7 +31,7 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { } } -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePtenDenseTensorBase( const paddle::framework::Tensor& src) { VLOG(3) << "MakePtenDenseTensor based Tensor."; pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()), @@ -44,15 +44,15 @@ std::unique_ptr MakePtenDenseTensor( } std::unique_ptr MakePtenDenseTensor( - const paddle::framework::LoDTensor& src) { - auto out = - MakePtenDenseTensor(static_cast(src)); + const paddle::framework::Tensor& src) { + auto out = MakePtenDenseTensorBase( + static_cast(src)); SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), src.lod()); return std::move(out); } -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePtenDenseTensorBase( const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def) { pten::DenseTensorMeta meta{ arg_def.dtype, src.dims(), src.layout(), src.offset()}; @@ -71,16 +71,15 @@ std::unique_ptr MakePtenDenseTensor( } std::unique_ptr MakePtenDenseTensor( - const paddle::framework::LoDTensor& src, - const pten::TensorArgDef& arg_def) { - auto out = MakePtenDenseTensor( + const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def) { + auto out = MakePtenDenseTensorBase( static_cast(src), arg_def); SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), src.lod()); return std::move(out); } -pten::Scalar MakePtenScalar(const paddle::framework::LoDTensor& src) { +pten::Scalar MakePtenScalar(const paddle::framework::Tensor& src) { PADDLE_ENFORCE_EQ(src.numel(), 1, paddle::platform::errors::InvalidArgument( @@ -138,7 +137,7 @@ pten::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { } } -pten::ScalarArray MakePtenScalarArray(const paddle::framework::LoDTensor& src) { +pten::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src) { if (src.type() == paddle::framework::proto::VarType::INT64) { return {src.data(), src.numel()}; } else if (src.type() == paddle::framework::proto::VarType::INT32) { @@ -295,7 +294,7 @@ std::unique_ptr MakePtenTensorBaseFromVar( return {}; } -void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { +void MovesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { PADDLE_ENFORCE_NOT_NULL( src, platform::errors::InvalidArgument( @@ -311,12 +310,12 @@ void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst->set_offset(src->meta().offset); } -void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { - MovesStorage(src, static_cast(dst)); +void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + MovesStorageBase(src, static_cast(dst)); SetLoD(dst->mutable_lod(), src->lod()); } -void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { +void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { PADDLE_ENFORCE_NOT_NULL( src, platform::errors::InvalidArgument( @@ -333,13 +332,13 @@ void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { dst->set_offset(src->meta().offset); } -void 
SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { - SharesStorage(src, static_cast(dst)); +void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + SharesStorageBase(src, static_cast(dst)); SetLoD(dst->mutable_lod(), src->lod()); } -void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, - pten::DenseTensor* dst) { +void ReMakePtenDenseTensorBase(const paddle::framework::Tensor& src, + pten::DenseTensor* dst) { VLOG(3) << "ReMakePtenDenseTensor based Tensor."; auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); @@ -361,17 +360,17 @@ void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, shared_storage->ResetAllocation(src.Holder()); } -void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, +void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, pten::DenseTensor* dst) { auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); SetLoD(&meta->lod, src.lod()); - ReMakePtenDenseTensor(static_cast(src), - dst); + ReMakePtenDenseTensorBase(static_cast(src), + dst); } -void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, - const pten::TensorArgDef& arg_def, - pten::DenseTensor* dst) { +void ReMakePtenDenseTensorByArgDefBase(const paddle::framework::Tensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { VLOG(3) << "ReMakePtenDenseTensor based Tensor and TensorArgDef."; auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); meta->dims = src.dims(); @@ -395,12 +394,12 @@ void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, } } -void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, +void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst) { auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); SetLoD(&meta->lod, src.lod()); - ReMakePtenDenseTensorByArgDef( + ReMakePtenDenseTensorByArgDefBase( static_cast(src), arg_def, dst); } diff --git a/paddle/pten/api/lib/utils/tensor_utils.h b/paddle/pten/api/lib/utils/tensor_utils.h index 06edb4a7516b0..0ac4ac7a33179 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.h +++ b/paddle/pten/api/lib/utils/tensor_utils.h @@ -33,12 +33,9 @@ namespace experimental { std::unique_ptr MakePtenDenseTensor( const paddle::framework::Tensor& src); -std::unique_ptr MakePtenDenseTensor( - const paddle::framework::LoDTensor& src); - -pten::Scalar MakePtenScalar(const paddle::framework::LoDTensor& src); +pten::Scalar MakePtenScalar(const paddle::framework::Tensor& src); -pten::ScalarArray MakePtenScalarArray(const paddle::framework::LoDTensor& src); +pten::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src); pten::Scalar MakePtenScalarFromVar(const framework::Variable& variable); @@ -56,12 +53,8 @@ std::unique_ptr MakePtenTensorBaseFromVar( void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); -void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); - void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst); -void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); - /** * In order to improve the compatibility state performance, some tricky tool * functions are added. 
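// NOTE(reviewer): every overload pair in tensor_utils is collapsed the same
// way -- the old Tensor-only variant becomes a *Base helper, and the single
// public entry point (now that LoDTensor is folded into Tensor) forwards to
// it and handles the LoD once. A minimal sketch of that shape, with a
// hypothetical Foo:
//
//   void FooBase(const paddle::framework::Tensor& src,
//                pten::DenseTensor* dst);  // meta + storage only
//   void Foo(const paddle::framework::Tensor& src, pten::DenseTensor* dst) {
//     FooBase(src, dst);
//     SetLoD(&pten::CompatibleDenseTensorUtils::GetMutableMeta(dst)->lod,
//            src.lod());
//   }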
@@ -74,17 +67,10 @@ void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst); void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, pten::DenseTensor* dst); -void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, - pten::DenseTensor* dst); - void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst); -void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, - const pten::TensorArgDef& arg_def, - pten::DenseTensor* dst); - void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, const pten::TensorArgDef& arg_def, pten::DenseTensor* dst); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index ec589b40e907f..0339abe0960c2 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -32,6 +32,10 @@ except Exception as e: raise e +# Patch LoDTensor +from . import core +core.LoDTensor = core.Tensor + # import all class inside framework into fluid module from . import framework from .framework import * @@ -69,6 +73,7 @@ from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder + from .core import LoDTensor, LoDTensorArray, Scope, _Scope from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace, MLUPlace from .incubate import fleet From 1053b1d5ed04f411db50e66848210d9f1996bde4 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 15 Jan 2022 14:52:58 +0800 Subject: [PATCH 07/10] replace last contextT (#38971) --- paddle/pten/kernels/gpu/scale_kernel.cu | 4 ++-- paddle/pten/kernels/math_kernel.h | 24 ++++++++++++------------ paddle/pten/kernels/scale_kernel.h | 8 ++++---- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/pten/kernels/gpu/scale_kernel.cu b/paddle/pten/kernels/gpu/scale_kernel.cu index 4d63701413cd6..14ee75e4f9130 100644 --- a/paddle/pten/kernels/gpu/scale_kernel.cu +++ b/paddle/pten/kernels/gpu/scale_kernel.cu @@ -43,8 +43,8 @@ struct ScaleFunctor { } }; -template -void ScaleKernel(const ContextT& dev_ctx, +template +void ScaleKernel(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, float bias, diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index e01103fc5b847..65c0f84e696de 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -67,8 +67,8 @@ void SumKernel(const Context& dev_ctx, DataType out_dtype, DenseTensor* out); -template -DenseTensor Add(const ContextT& dev_ctx, +template +DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis) { @@ -77,12 +77,12 @@ DenseTensor Add(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - AddKernel(dev_ctx, x, y, axis, &dense_out); + AddKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } -template -DenseTensor Subtract(const ContextT& dev_ctx, +template +DenseTensor Subtract(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis) { @@ -91,12 +91,12 @@ DenseTensor Subtract(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - SubtractKernel(dev_ctx, x, y, axis, &dense_out); + SubtractKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } -template -DenseTensor Divide(const ContextT& dev_ctx, +template +DenseTensor Divide(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int 
axis) { @@ -105,12 +105,12 @@ DenseTensor Divide(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - DivideKernel(dev_ctx, x, y, axis, &dense_out); + DivideKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } -template -DenseTensor Multiply(const ContextT& dev_ctx, +template +DenseTensor Multiply(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, int axis) { @@ -119,7 +119,7 @@ DenseTensor Multiply(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - MultiplyKernel(dev_ctx, x, y, axis, &dense_out); + MultiplyKernel(dev_ctx, x, y, axis, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/scale_kernel.h b/paddle/pten/kernels/scale_kernel.h index ba16db566b8bb..1cd11f0b8788f 100644 --- a/paddle/pten/kernels/scale_kernel.h +++ b/paddle/pten/kernels/scale_kernel.h @@ -28,15 +28,15 @@ void ScaleKernel(const Context& dev_ctx, bool bias_after_scale, DenseTensor* out); -template -DenseTensor Scale(const ContextT& dev_ctx, +template +DenseTensor Scale(const Context& dev_ctx, const DenseTensor& x, const Scalar& scale, float bias, bool bias_after_scale) { auto out_meta = UnchangedInferMeta(x.meta()); - auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); - ScaleKernel( + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + ScaleKernel( dev_ctx, x, scale, bias, bias_after_scale, &dense_out); return dense_out; } From 35d2b71ab531b7b34c42576da49651ba7282300f Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 15 Jan 2022 15:44:48 +0800 Subject: [PATCH 08/10] [PTen] Remove cached kernel context (#38953) * remove cached kernel context * revert dataloader format change --- .../framework/new_executor/interpretercore.cc | 9 +- .../new_executor/interpretercore_util.cc | 11 +- .../new_executor/new_executor_defs.cc | 4 - .../new_executor/new_executor_defs.h | 5 +- paddle/fluid/framework/operator.cc | 122 +++++------------- paddle/fluid/framework/operator.h | 13 +- paddle/fluid/imperative/layer.cc | 15 +-- paddle/fluid/imperative/op_base.h | 5 - paddle/fluid/imperative/prepared_operator.cc | 100 ++++---------- paddle/fluid/imperative/prepared_operator.h | 13 +- paddle/fluid/imperative/tracer.cc | 2 - .../fluid/dataloader/dataloader_iter.py | 19 --- 12 files changed, 82 insertions(+), 236 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 950756c0394a5..aea9ad2035396 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -418,15 +418,16 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { VLOG(4) << "Run pten kernel: " << op->Type(); VLOG(4) << instr_node.InnerRuntimeContext().get() << " " << &instr_node.DeviceContext(); + pten::KernelContext pt_kernel_context; op_with_kernel->BuildPtenKernelContext( *instr_node.InnerRuntimeContext().get(), - const_cast(&instr_node.DeviceContext())); + const_cast(&instr_node.DeviceContext()), + &pt_kernel_context); - (*instr_node.PtenKernel())(instr_node.PtenKernelContext()); + (*instr_node.PtenKernel())(&pt_kernel_context); op_with_kernel->WriteBackToOutputs( - instr_node.InnerRuntimeContext().get()); - instr_node.PtenKernelContext()->ClearData(); + instr_node.InnerRuntimeContext().get(), &pt_kernel_context); } else { instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); } diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc 
b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 7ced4853c2d8f..214a1d728266b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -425,13 +425,14 @@ void build_op_func_list(const platform::Place& place, } if (run_pten_kernel) { - op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx); + pten::KernelContext pt_kernel_context; + op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx, + &pt_kernel_context); op_func_node.pt_kernel_ = op_with_kernel->PtenKernel(); - op_func_node.pt_kernel_context_ = op_with_kernel->PtenKernelContext(); - (*op_func_node.pt_kernel_)(op_func_node.pt_kernel_context_); - op_with_kernel->WriteBackToOutputs(&runtime_context); - op_func_node.pt_kernel_context_->ClearData(); + (*op_func_node.pt_kernel_)(&pt_kernel_context); + op_with_kernel->WriteBackToOutputs(&runtime_context, + &pt_kernel_context); } else { op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); op_func_node.kernel_func_(exec_ctx); diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 654746794da4e..fb29e18887b4e 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -688,10 +688,6 @@ pten::Kernel* Instruction::PtenKernel() const { return op_func_node_.pt_kernel_; } -pten::KernelContext* Instruction::PtenKernelContext() const { - return op_func_node_.pt_kernel_context_; -} - OpFuncType Instruction::KernelType() const { return op_func_node_.type_; } OperatorBase* Instruction::OpBase() const { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 5d63eb33d424b..0ef85a25a237b 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -299,8 +299,7 @@ struct OpFuncNode { platform::DeviceContext* dev_ctx_; // not owned // fit for pten kernel - pten::Kernel* pt_kernel_{nullptr}; // not owned - pten::KernelContext* pt_kernel_context_{nullptr}; // not onwed + pten::Kernel* pt_kernel_{nullptr}; // not owned OpFuncType type_; }; @@ -322,8 +321,6 @@ class Instruction { pten::Kernel* PtenKernel() const; - pten::KernelContext* PtenKernelContext() const; - OpFuncType KernelType() const; OperatorBase* OpBase() const; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index aa21c8eed256b..ff12edb72c06a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1192,13 +1192,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("compute", platform::EventRole::kInnerOp); if (run_pten_kernel_) { - if (pt_kernel_context_ == nullptr) { - pt_kernel_context_.reset(new pten::KernelContext()); - } - BuildPtenKernelContext(*runtime_ctx, dev_ctx); - (*pt_kernel_)(pt_kernel_context_.get()); - WriteBackToOutputs(runtime_ctx); - pt_kernel_context_->ClearData(); + pten::KernelContext pt_kernel_context; + BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); + (*pt_kernel_)(&pt_kernel_context); + WriteBackToOutputs(runtime_ctx, &pt_kernel_context); } else { (*kernel_func_)( ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx)); @@ -1791,18 +1788,9 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( } void 
OperatorWithKernel::BuildPtenKernelContext( - const RuntimeContext& ctx, platform::DeviceContext* dev_ctx) const { - if (pt_kernel_context_ == nullptr) { - pt_kernel_context_.reset(new pten::KernelContext()); - } - // TODO(chenweihang): now only work for very simple case, - // many cases need to be deal with later: - // 1. the input and output are not tensor - // 2. the dispensbale, duplicable input and output - // 3. needless attributes remove - // 4. use pt Tensor directly - // 5. kernel input is not DenseTensor - pt_kernel_context_->SetDeviceContext(dev_ctx); + const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, + pten::KernelContext* pt_kernel_context) const { + pt_kernel_context->SetDeviceContext(dev_ctx); auto& input_names = std::get<0>(pt_kernel_signature_->args); auto& attr_names = std::get<1>(pt_kernel_signature_->args); @@ -1836,33 +1824,14 @@ void OperatorWithKernel::BuildPtenKernelContext( // calcute the start and end index of the input tensors size_t start_idx = - (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second); + (i == 0 ? 0 : pt_kernel_context->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - auto current_vector_size = pt_kernel_context_->InputsSize(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. for (size_t offset = 0; offset < ins_vector.size(); ++offset) { - if (current_vector_size > start_idx + offset) { - auto& input_ptr = - pt_kernel_context_->MutableInputPtrAt(start_idx + offset); - if (input_ptr == nullptr) { - input_ptr = experimental::MakePtenTensorBaseFromVar( - *ins_vector[offset], in_def); - } else { - experimental::ReMakePtenDenseTensorFromVar( - *ins_vector[offset], in_def, - pt_kernel_context_->MutableInputAt(start_idx + - offset)); - } - } else { - pt_kernel_context_->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], - in_def)); - } + pt_kernel_context->EmplaceBackInputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(*ins_vector[offset], in_def)); } - pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i); + pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < output_names.size(); ++i) { @@ -1870,43 +1839,24 @@ void OperatorWithKernel::BuildPtenKernelContext( auto& outs_vector = ctx.outputs.at(output_names[i]); size_t start_idx = - (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second); + (i == 0 ? 0 : pt_kernel_context->OutputRangeAt(i - 1).second); size_t end_idx = start_idx + outs_vector.size(); - auto current_vector_size = pt_kernel_context_->OutputsSize(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. 
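// NOTE(reviewer): with a KernelContext built per call, the reuse-or-create
// branching deleted here becomes unnecessary; RunImpl now fills a fresh,
// stack-local context on every run, as the operator.cc hunk earlier in this
// commit shows:
//
//   pten::KernelContext pt_kernel_context;  // lives for one Run only
//   BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context);
//   (*pt_kernel_)(&pt_kernel_context);
//   WriteBackToOutputs(runtime_ctx, &pt_kernel_context);
//   // destroyed on scope exit, so no ClearData() call is required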
for (size_t offset = 0; offset < outs_vector.size(); ++offset) { - if (current_vector_size > start_idx + offset) { - auto* buffer_tensor = - pt_kernel_context_->MutableOutputAt(start_idx + - offset); - if (buffer_tensor) { - experimental::ReMakePtenDenseTensorFromVar(outs_vector[offset], - out_def, buffer_tensor); - } - } else { - pt_kernel_context_->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(outs_vector[offset], - out_def)); - } + pt_kernel_context->EmplaceBackOutputWithoutSetRange( + experimental::MakePtenTensorBaseFromVar(outs_vector[offset], + out_def)); } // Deal with the case that some outputs are NULL when run the kernel. // For example : the outputs of matmul_grad are dx and dy, // sometimes dx or dy may be NULL. if (outs_vector.empty()) { - if (current_vector_size > start_idx) { - pt_kernel_context_->SetOutputWithoutSetRange(start_idx, {nullptr}); - } else { - pt_kernel_context_->EmplaceBackOutputWithoutSetRange({nullptr}); - } + pt_kernel_context->EmplaceBackOutputWithoutSetRange({nullptr}); end_idx = start_idx + 1; } - pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx), - i); + pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for (size_t i = 0; i < attr_names.size(); ++i) { @@ -1915,11 +1865,11 @@ void OperatorWithKernel::BuildPtenKernelContext( if (attr_iter != Attrs().end()) { // shape is in the attribute if (std::type_index(attr_iter->second.type()) == std::type_index(typeid(std::vector))) { - pt_kernel_context_->EmplaceBackAttr(std::move(pten::ScalarArray( + pt_kernel_context->EmplaceBackAttr(std::move(pten::ScalarArray( BOOST_GET_CONST(std::vector, attr_iter->second)))); } else if (std::type_index(attr_iter->second.type()) == std::type_index(typeid(std::vector))) { - pt_kernel_context_->EmplaceBackAttr(std::move(pten::ScalarArray( + pt_kernel_context->EmplaceBackAttr(std::move(pten::ScalarArray( BOOST_GET_CONST(std::vector, attr_iter->second)))); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -1930,10 +1880,10 @@ void OperatorWithKernel::BuildPtenKernelContext( } else { // shape is in the input auto& ins_vector = ctx.inputs.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor - pt_kernel_context_->EmplaceBackAttr(std::move( + pt_kernel_context->EmplaceBackAttr(std::move( experimental::MakePtenScalarArrayFromVar(*ins_vector.front()))); } else { // ShapeTensorList - pt_kernel_context_->EmplaceBackAttr(std::move( + pt_kernel_context->EmplaceBackAttr(std::move( experimental::MakePtenScalarArrayFromVarList(ins_vector))); } } @@ -1946,11 +1896,11 @@ void OperatorWithKernel::BuildPtenKernelContext( if (attr_iter != Attrs().end()) { // scalar is in the attribute auto& attr = Attrs().at(attr_names[i]); if (std::type_index(attr.type()) == std::type_index(typeid(float))) { - pt_kernel_context_->EmplaceBackAttr( + pt_kernel_context->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(float, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(std::string))) { - pt_kernel_context_->EmplaceBackAttr( + pt_kernel_context->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr)))); } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -1960,7 +1910,7 @@ void OperatorWithKernel::BuildPtenKernelContext( } } else { auto& ins_vector = ctx.inputs.at(attr_names[i]); - pt_kernel_context_->EmplaceBackAttr(std::move( + pt_kernel_context->EmplaceBackAttr(std::move( experimental::MakePtenScalarFromVar(*ins_vector.front()))); } @@ -1968,17 +1918,17 
@@ void OperatorWithKernel::BuildPtenKernelContext( // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context_->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(pten::DataType))) { auto data_type = pten::TransToPtenDataType( static_cast( BOOST_GET_CONST(int, attr))); - pt_kernel_context_->EmplaceBackAttr(data_type); + pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == @@ -1987,7 +1937,7 @@ void OperatorWithKernel::BuildPtenKernelContext( const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); - pt_kernel_context_->EmplaceBackAttr(vector_int64_attr); + pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } // TODO(YuanRisheng) Need support vector attr @@ -2001,20 +1951,16 @@ void OperatorWithKernel::BuildPtenKernelContext( } } -void OperatorWithKernel::WriteBackToOutputs(RuntimeContext* ctx) const { - // auto& input_names = std::get<0>(pt_kernel_signature_->args); - // auto& attr_names = std::get<1>(pt_kernel_signature_->args); +void OperatorWithKernel::WriteBackToOutputs( + RuntimeContext* ctx, pten::KernelContext* pt_kernel_context) const { auto& output_names = std::get<2>(pt_kernel_signature_->args); - // pt_kernel_context_ - for (size_t i = 0; i < output_names.size(); ++i) { auto& outs_vector = ctx->outputs.at(output_names[i]); - auto& range_pair = pt_kernel_context_->OutputRangeAt(i); - auto pten_outs = - pt_kernel_context_->MutableOutputBetween( - range_pair.first, range_pair.second); + auto& range_pair = pt_kernel_context->OutputRangeAt(i); + auto pten_outs = pt_kernel_context->MutableOutputBetween( + range_pair.first, range_pair.second); for (size_t j = 0; j < pten_outs.size(); ++j) { if (pten_outs[j]) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 12946b416cf9f..3aab9165eae0a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -589,16 +589,14 @@ class OperatorWithKernel : public OperatorBase { void ChoosePtenKernel(const ExecutionContext& ctx) const; void BuildPtenKernelContext(const RuntimeContext& ctx, - platform::DeviceContext* dev_ctx) const; + platform::DeviceContext* dev_ctx, + pten::KernelContext* pt_kernel_context) const; - void WriteBackToOutputs(RuntimeContext* ctx) const; + void WriteBackToOutputs(RuntimeContext* ctx, + pten::KernelContext* pt_kernel_context) const; pten::Kernel* PtenKernel() const { return pt_kernel_.get(); } - pten::KernelContext* PtenKernelContext() const { - return pt_kernel_context_.get(); - } - const OpKernelType* kernel_type() const { return kernel_type_.get(); } private: @@ -657,9 +655,6 @@ class OperatorWithKernel : public OperatorBase { mutable bool run_pten_kernel_ = false; mutable std::unique_ptr pt_kernel_signature_; 
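// NOTE(reviewer): the member removed just below was mutable state shared
// across Run() calls, which is also why the DataLoader workaround deleted at
// the end of this commit is no longer needed. Assumed before/after contrast:
//
//   // before: one cached context, reused and ClearData()'d on every run
//   mutable std::unique_ptr<pten::KernelContext> pt_kernel_context_;
//   // after: no member at all -- RunImpl declares pten::KernelContext on
//   // its own stack, so concurrent tracers cannot race on shared state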
mutable std::unique_ptr pt_kernel_; - // In order to reduce the compatibility phase - // performance overhead, temporarily cache KernelContext - mutable std::unique_ptr pt_kernel_context_; }; extern bool OpSupportGPU(const std::string& op_type); diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index d8ee400e35082..cc7fcf455a13d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -409,8 +409,6 @@ void VarBase::_CopyGradientFrom(const VarBase& src) { } } -pten::KernelContext OpBase::pt_kernel_context_; - void OpBase::SetType(const std::string& type) { op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); } @@ -426,8 +424,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, - const platform::Place& place, - pten::KernelContext* pt_kernel_context) { + const platform::Place& place) { auto* op_kernel = dynamic_cast(&op); PADDLE_ENFORCE_NOT_NULL( op_kernel, platform::errors::PermissionDenied( @@ -468,8 +465,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, * after the execution of op, but the original input is directly * overwritten in the previous dynamic graph implemention. */ - auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, - default_attrs, pt_kernel_context); + auto prepared_op = + PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs); auto tmp_ins_ptr = PrepareData(*op_kernel, ins, prepared_op.kernel_type()); if (tmp_ins_ptr == nullptr) { @@ -497,8 +494,7 @@ void OpBase::Run(const framework::OperatorBase& op, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place, - &pt_kernel_context_); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void OpBase::Run(const framework::OperatorBase& op, @@ -507,8 +503,7 @@ void OpBase::Run(const framework::OperatorBase& op, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place, - &pt_kernel_context_); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void ClearNoNeedBufferInputs(OpBase* op) { diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index cb76a82353282..3d0847605566b 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -183,8 +183,6 @@ class OpBase { const framework::AttributeMap& default_attrs, const platform::Place& place); - static pten::KernelContext* GetKernelContext() { return &pt_kernel_context_; } - bool HasVoidFunctionPostHook() const { return !void_function_post_hooks_.empty(); } @@ -212,9 +210,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - // In order to reduce the compatibility phase - // performance overhead, temporarily cache KernelContext - static pten::KernelContext pt_kernel_context_; std::vector>> void_function_post_hooks_; }; diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 46e974c8f43f3..15a278c2e6464 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -117,7 +117,6 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, const framework::OpKernelType& kernel_type, const 
framework::KernelSignature& kernel_signature, const pten::Kernel& pt_kernel, - pten::KernelContext* pt_kernel_context, platform::DeviceContext* dev_ctx) : op_(op), ctx_(ctx), @@ -126,8 +125,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, dev_ctx_(dev_ctx), run_pten_kernel_(true), pt_kernel_signature_(kernel_signature), - pt_kernel_(pt_kernel), - pt_kernel_context_(pt_kernel_context) {} + pt_kernel_(pt_kernel) {} template PreparedOp PrepareImpl(const NameVarMap& ins, @@ -135,8 +133,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context) { + const framework::AttributeMap& default_attrs) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -178,7 +175,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, // TODO(chenweihang): using CPUKernel when miss device kernel case return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, - pt_kernel, pt_kernel_context, dev_ctx); + pt_kernel, dev_ctx); } else { VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name << "` not found."; @@ -247,10 +244,8 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context) { - return PrepareImpl(ins, outs, op, place, attrs, default_attrs, - pt_kernel_context); + const framework::AttributeMap& default_attrs) { + return PrepareImpl(ins, outs, op, place, attrs, default_attrs); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, @@ -258,10 +253,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context) { + const framework::AttributeMap& default_attrs) { return PrepareImpl(ins, outs, op, place, attrs, - default_attrs, pt_kernel_context); + default_attrs); } template @@ -271,13 +265,6 @@ static void BuildDygraphPtenKernelContext( const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs, platform::DeviceContext* dev_ctx, pten::KernelContext* kernel_ctx) { - // TODO(chenweihang): now only work for very simple case, - // many cases need to be deal with later: - // 1. the input and output are not tensor - // 2. the dispensbale, duplicable input and output - // 3. needless attributes remove - // 4. use pt Tensor directly - // 5. kernel input is not DenseTensor kernel_ctx->SetDeviceContext(dev_ctx); auto& input_names = std::get<0>(pt_kernel_signature.args); @@ -312,26 +299,11 @@ static void BuildDygraphPtenKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); size_t end_idx = start_idx + ins_vector.size(); - auto current_vector_size = kernel_ctx->InputsSize(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. 
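// NOTE(reviewer): with the pt_kernel_context parameter dropped from
// Prepare() above, a dygraph call site simplifies to the following (wiring
// taken from the layer.cc hunk in this commit):
//
//   auto prepared_op =
//       PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs);
//   prepared_op.Run(ins, outs, attrs, default_attrs);
//   // Run() builds and destroys its own pten::KernelContext internally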
for (size_t offset = 0; offset < ins_vector.size(); ++offset) { const auto& variable = ins_vector[offset]->Var(); - if (current_vector_size > start_idx + offset) { - auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset); - if (input_ptr == nullptr) { - input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def); - } else { - experimental::ReMakePtenDenseTensorFromVar( - variable, in_def, kernel_ctx->MutableInputAt( - start_idx + offset)); - } - } else { - kernel_ctx->EmplaceBackInputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar(variable, in_def)); - } + kernel_ctx->EmplaceBackInputWithoutSetRange( + paddle::experimental::MakePtenTensorBaseFromVar(variable, in_def)); } kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); } @@ -340,15 +312,10 @@ static void BuildDygraphPtenKernelContext( auto& out_def = output_defs.at(i); size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second); - auto current_vector_size = kernel_ctx->OutputsSize(); auto iter = outs.find(output_names[i]); if (iter == outs.end()) { - if (current_vector_size > start_idx) { - kernel_ctx->SetOutputWithoutSetRange(start_idx, {nullptr}); - } else { - kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr}); - } + kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr}); kernel_ctx->AssignOutputRange(std::make_pair(start_idx, start_idx + 1), i); continue; @@ -357,27 +324,10 @@ static void BuildDygraphPtenKernelContext( auto& outs_vector = iter->second; size_t end_idx = start_idx + outs_vector.size(); - // If the memory needed is less than the current memory allocated, we will - // reuse the current memory by using ReMakePtenDenseTensorFromVar. - // Otherwise,we will create new storage. for (size_t offset = 0; offset < outs_vector.size(); ++offset) { - if (current_vector_size > start_idx + offset) { - auto* buffer_tensor = - kernel_ctx->MutableOutputAt(start_idx + offset); - if (buffer_tensor) { - experimental::ReMakePtenDenseTensorFromVar( - outs_vector[offset]->MutableVar(), out_def, buffer_tensor); - } else { - kernel_ctx->SetOutputWithoutSetRange( - start_idx + offset, - experimental::MakePtenTensorBaseFromVar( - outs_vector[offset]->MutableVar(), out_def)); - } - } else { - kernel_ctx->EmplaceBackOutputWithoutSetRange( - experimental::MakePtenTensorBaseFromVar( - outs_vector[offset]->MutableVar(), out_def)); - } + kernel_ctx->EmplaceBackOutputWithoutSetRange( + paddle::experimental::MakePtenTensorBaseFromVar( + outs_vector[offset]->MutableVar(), out_def)); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } @@ -556,19 +506,20 @@ static void PreparedOpRunPtImpl( const framework::OperatorBase& op, const framework::OpKernelType& kernel_type, const framework::KernelSignature& pt_kernel_signature, - const pten::Kernel& pt_kernel, pten::KernelContext* pt_kernel_context, - platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs, + const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx, + const NameVarMap& ins, const NameVarMap& outs, + const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { DygraphInferShapeContext infer_shape_ctx( &ins, &outs, &attrs, &default_attrs, op.Type(), &kernel_type); op.Info().infer_shape_(&infer_shape_ctx); + pten::KernelContext pt_kernel_context; BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs, dev_ctx, - pt_kernel_context); + &pt_kernel_context); - 
pt_kernel(pt_kernel_context); + pt_kernel(&pt_kernel_context); if (FLAGS_benchmark) { dev_ctx->Wait(); @@ -578,10 +529,7 @@ static void PreparedOpRunPtImpl( #endif } - WriteBackToOutputs(pt_kernel_signature, outs, pt_kernel_context); - - // Ensure that it does not affect the VarBase life cycle management - pt_kernel_context->ClearData(); + WriteBackToOutputs(pt_kernel_signature, outs, &pt_kernel_context); // TODO(chenweihang): add debug flags later if (framework::IsComplexType(kernel_type.data_type_)) { @@ -595,8 +543,8 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, - pt_kernel_, pt_kernel_context_, dev_ctx_, ins, - outs, attrs, default_attrs); + pt_kernel_, dev_ctx_, ins, outs, attrs, + default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -609,8 +557,8 @@ void PreparedOp::Run(const NameVarMap& ins, const framework::AttributeMap& default_attrs) { if (run_pten_kernel_) { PreparedOpRunPtImpl( - op_, kernel_type_, pt_kernel_signature_, pt_kernel_, pt_kernel_context_, - dev_ctx_, ins, outs, attrs, default_attrs); + op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, + outs, attrs, default_attrs); } else { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, outs, attrs, default_attrs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 29747e79ef6fa..22f016e2cadc1 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -153,25 +153,21 @@ class PreparedOp { const framework::RuntimeContext& ctx, const framework::OpKernelType& kernel_type, const framework::KernelSignature& kernel_signature, - const pten::Kernel& pt_kernel, - pten::KernelContext* pt_kernel_context, - platform::DeviceContext* dev_ctx); + const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context = nullptr); + const framework::AttributeMap& default_attrs); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, const framework::AttributeMap& attrs, - const framework::AttributeMap& default_attrs, - pten::KernelContext* pt_kernel_context = nullptr); + const framework::AttributeMap& default_attrs); void Run(const NameVarMap& in, const NameVarMap& out, const framework::AttributeMap& attrs, @@ -196,9 +192,6 @@ class PreparedOp { bool run_pten_kernel_{false}; framework::KernelSignature pt_kernel_signature_; pten::Kernel pt_kernel_; - // In order to reduce the compatibility phase - // performance overhead, temporarily cache KernelContext - pten::KernelContext* pt_kernel_context_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 682916a9b323b..7ed9f08906a73 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -231,8 +231,6 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place); } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(type, 
&exception); - // Compatible impl: clear pten kernel context data when throw error - OpBase::GetKernelContext()->ClearData(); throw std::move(exception); } catch (std::exception& ex) { PADDLE_THROW(platform::errors::Fatal( diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 10a9358612960..a3e6ea6d1bc78 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -202,22 +202,6 @@ def _thread_loop(self, legacy_expected_place): # APIs in this thread. _set_expected_place(legacy_expected_place) - # NOTE(chenweihang): [ Why need to set not to execute pten kernel here? ] - # Now, in order to ensure that the execution performance of the dynamic - # graph mode in pten compatible state does not decline significantly, - # we have adopted the approach of caching a KernelContext globally for - # the dynamic graph tracer to reduce the construction and deconstruction - # overhead of data interfaces such as the compatible state DenseTensor. - # The static graph is each op caches a KernelContext, but the op of - # the dynamic graph will be constructed and destroyed every round of - # execution, so it is impossible to cache KernelContext for each op. - # However, it is not thread-safe if using only one global kernel context in - # dynamic graph. If the pten op of paddle is used in the DataLoader thread, - # it may cause access errors. We temporarily do not execute pten kernel - # in this scenario and will find a better solution later and remove - # this setting. - set_flags({'FLAGS_run_pten_kernel': False}) - while not self._thread_done_event.is_set(): try: indices = next(self._sampler_iter) @@ -519,9 +503,6 @@ def _thread_loop(self, legacy_expected_place): # APIs in this thread. _set_expected_place(legacy_expected_place) - # NOTE(chenweihang): See Note [ Why need to set not to execute pten kernel here? 
] - set_flags({'FLAGS_run_pten_kernel': False}) - while not self._thread_done_event.is_set(): batch = self._get_data() if not self._thread_done_event.is_set(): From d13c779900b2cdab89d21e57f87ec571b8a441e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Sat, 15 Jan 2022 20:03:54 +0800 Subject: [PATCH 09/10] isolates friends of storage, test=develop (#38977) --- paddle/pten/api/lib/utils/tensor_utils.cc | 45 +++++------------------ paddle/pten/core/compat_utils.h | 9 +---- 2 files changed, 10 insertions(+), 44 deletions(-) diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index edd5cde938630..f304268bedf45 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -325,9 +325,7 @@ void SharesStorageBase(pten::DenseTensor* src, paddle::framework::Tensor* dst) { platform::errors::InvalidArgument( "The destination Tensor is nullptr when move allocation.")); dst->Resize(src->dims()); - auto* storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); - dst->ResetHolderWithType(storage->GetAllocation(), + dst->ResetHolderWithType(src->Holder(), pten::TransToProtoVarType(src->dtype())); dst->set_offset(src->meta().offset); } @@ -345,19 +343,7 @@ void ReMakePtenDenseTensorBase(const paddle::framework::Tensor& src, meta->dtype = pten::TransToPtenDataType(src.type()); meta->layout = src.layout(); meta->offset = src.offset(); - - auto* shared_storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); - PADDLE_ENFORCE_NOT_NULL( - shared_storage, - platform::errors::NotFound( - "Target DenseTensor's shared storage is nullptr.")); - - PADDLE_ENFORCE_EQ(src.IsInitialized(), - true, - paddle::platform::errors::InvalidArgument( - "Source Tensor is not initialized.")); - shared_storage->ResetAllocation(src.Holder()); + dst->ResetHolder(src.Holder()); } void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, @@ -378,19 +364,12 @@ void ReMakePtenDenseTensorByArgDefBase(const paddle::framework::Tensor& src, meta->layout = src.layout(); meta->offset = src.offset(); - auto* shared_storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); - PADDLE_ENFORCE_NOT_NULL( - shared_storage, - platform::errors::NotFound( - "Target DenseTensor's shared storage is nullptr.")); - if (src.IsInitialized() && src.place() == pten::TransToFluidPlace(arg_def.backend)) { - shared_storage->ResetAllocation(src.Holder()); + dst->ResetHolder(src.Holder()); } else { - shared_storage->ResetAllocationPlace( - pten::TransToFluidPlace(arg_def.backend)); + // This does not affect the correctness, and will be modified immediately. 
+ // dst->mutable_data(pten::TransToFluidPlace(arg_def.backend)); } } @@ -481,14 +460,10 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, tensor->Resize(src->dims()); SetLoD(tensor->mutable_lod(), src->lod()); - // here dynamic_cast is slow - auto* storage = static_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); - if (!tensor->IsInitialized() || (tensor->IsInitialized() && - !IsSameAllocation(tensor->Holder(), storage->GetAllocation()))) { - tensor->ResetHolderWithType(std::move(storage->GetAllocation()), dtype); + !IsSameAllocation(tensor->Holder(), src->Holder()))) { + tensor->ResetHolderWithType(std::move(src->Holder()), dtype); } else { // Even the pten tensor and Variable have the same Alloctation (both have // the same pointer address, same size and same place) @@ -502,10 +477,8 @@ void MakeVariableFromPtenTensor(pten::DenseTensor* src, auto dtype = pten::TransToProtoVarType(src->dtype()); if (!tensor->value().IsInitialized()) { - auto storage = dynamic_cast( - pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); - tensor->mutable_value()->ResetHolderWithType( - std::move(storage->GetAllocation()), dtype); + tensor->mutable_value()->ResetHolderWithType(std::move(src->Holder()), + dtype); } } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/pten/core/compat_utils.h b/paddle/pten/core/compat_utils.h index 0bd82080ddebc..46e53e3997cc1 100644 --- a/paddle/pten/core/compat_utils.h +++ b/paddle/pten/core/compat_utils.h @@ -31,10 +31,6 @@ namespace pten { class CompatibleDenseTensorUtils { public: - static Storage* UnsafeGetMutableStorage(DenseTensor* tensor) { - return tensor->storage_.get(); - } - static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) { return &(tensor->meta_); } @@ -42,10 +38,7 @@ class CompatibleDenseTensorUtils { // only can deal with SharedStorage now static void ClearStorage(DenseTensor* tensor) { // use static_cast to improve performance, replace by dynamic_cast later - if (tensor->storage_ != nullptr) { - static_cast(tensor->storage_.get()) - ->Reset(); - } + tensor->MoveMemoryHolder(); } static DenseTensor Slice(const DenseTensor& tensor, From 5c3586746792056d86a72f114167103f98b3af29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Sat, 15 Jan 2022 21:58:35 +0800 Subject: [PATCH 10/10] updates the ctor of tensor, test=develop (#38946) --- .../accumulation_node_test.cc | 15 ++- .../autograd_meta_test.cc | 5 +- .../data_structure_tests/eager_tensor_test.cc | 10 +- .../grad_node_info_test.cc | 10 +- .../data_structure_tests/grad_node_test.h | 5 +- .../grad_tensor_holder_test.cc | 15 ++- .../tensor_wrapper_test.cc | 10 +- .../tests/task_tests/eager_utils_test.cc | 15 ++- paddle/pten/api/lib/utils/allocator.h | 16 +-- paddle/pten/core/dense_tensor.cc | 6 +- paddle/pten/core/dense_tensor.h | 6 +- paddle/pten/core/storage.cc | 2 +- paddle/pten/core/storage.h | 19 +-- paddle/pten/tests/api/CMakeLists.txt | 2 - paddle/pten/tests/api/test_cast_api.cc | 4 +- paddle/pten/tests/api/test_conj_api.cc | 4 +- paddle/pten/tests/api/test_dot_api.cc | 6 +- paddle/pten/tests/api/test_elementwise_api.cc | 24 ++-- paddle/pten/tests/api/test_empty_api.cc | 12 +- paddle/pten/tests/api/test_fill_api.cc | 22 ++-- paddle/pten/tests/api/test_flatten_api.cc | 4 +- paddle/pten/tests/api/test_matmul_api.cc | 20 +-- paddle/pten/tests/api/test_mean_api.cc | 4 +- paddle/pten/tests/api/test_reshape_api.cc | 4 +- paddle/pten/tests/api/test_storage.cc 
| 65 --------- paddle/pten/tests/api/test_sum_api.cc | 4 +- paddle/pten/tests/api/test_tensor_utils.cc | 124 ------------------ paddle/pten/tests/api/test_to_api.cc | 4 +- paddle/pten/tests/core/CMakeLists.txt | 2 - paddle/pten/tests/core/allocator.h | 67 +--------- paddle/pten/tests/core/test_allocator.cc | 95 -------------- paddle/pten/tests/core/test_dense_tensor.cc | 13 +- paddle/pten/tests/core/test_storage.cc | 40 ------ .../pten/tests/kernels/test_cast_dev_api.cc | 4 +- .../pten/tests/kernels/test_conj_dev_api.cc | 4 +- .../pten/tests/kernels/test_copy_dev_api.cc | 6 +- .../tests/kernels/test_creation_dev_api.cc | 8 +- paddle/pten/tests/kernels/test_dot_dev_api.cc | 6 +- .../tests/kernels/test_elementwise_dev_api.cc | 24 ++-- .../tests/kernels/test_flatten_dev_api.cc | 4 +- .../pten/tests/kernels/test_matmul_dev_api.cc | 6 +- .../pten/tests/kernels/test_mean_dev_api.cc | 4 +- .../tests/kernels/test_reshape_dev_api.cc | 4 +- .../pten/tests/kernels/test_scale_dev_api.cc | 13 +- paddle/pten/tests/kernels/test_sum_dev_api.cc | 4 +- 45 files changed, 175 insertions(+), 566 deletions(-) delete mode 100644 paddle/pten/tests/api/test_storage.cc delete mode 100644 paddle/pten/tests/api/test_tensor_utils.cc delete mode 100644 paddle/pten/tests/core/test_allocator.cc delete mode 100644 paddle/pten/tests/core/test_storage.cc diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index f249d2099f24c..cdc9701009513 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -32,15 +32,17 @@ TEST(AccumulationNode, EagerTensor) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT16, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt0->mutable_data()[0] = 10.0; EagerTensor et0 = EagerTensor(dt0); std::shared_ptr dt1 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt1->mutable_data()[0] = 20.0; @@ -48,8 +50,9 @@ TEST(AccumulationNode, EagerTensor) { std::shared_ptr grad_dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); EagerTensor grad_et = EagerTensor(grad_dt); diff --git a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc index 96845569ca0c5..3d45dc831d411 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/autograd_meta_test.cc @@ -42,8 +42,9 @@ TEST(AutogradMeta, MemberFunction) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index 84daf4eac4ce6..a483ddb6a98f6 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ 
b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -36,8 +36,9 @@ TEST(EagerTensor, Constructor) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; @@ -65,8 +66,9 @@ TEST(EagerTensor, MemberFunction) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index a89fb019d5b37..7f6609b88a527 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -41,8 +41,9 @@ TEST(GradNodeInfo, GradNodeBase) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; @@ -97,8 +98,9 @@ TEST(GradNodeInfo, GradNodeBase) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 6.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index 2870bfa8b0c94..433a00e27be0e 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -37,8 +37,9 @@ class GradTestNode : public egr::GradNodeBase { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 6.0f; diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 3581ef59cd5be..c88a5f5fdcef5 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -36,8 +36,9 @@ TEST(GradTensorHolder, Constructor) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({2, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); EagerTensor et = EagerTensor(dt); @@ -52,15 +53,17 @@ TEST(GradTensorHolder, Interfaces) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, 
paddle::framework::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt0->mutable_data()[0] = 10.0; EagerTensor et0 = EagerTensor(dt0); std::shared_ptr dt1 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt1->mutable_data()[0] = 20.0; EagerTensor et1 = EagerTensor(dt1); diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc index 6d78cf42d0c48..8bc739d455a95 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc @@ -25,8 +25,9 @@ TEST(TensorWrapper, Basic) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); dt_ptr[0] = 5.0f; @@ -51,8 +52,9 @@ TEST(TensorWrapper, Basic) { pten::DenseTensorMeta meta2 = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 2})); std::shared_ptr dt2 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta2); auto* dt_ptr2 = dt->mutable_data(); dt_ptr2[0] = 6.0f; diff --git a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc index ea9aae83ff189..1b2f1287b069d 100644 --- a/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/eager_utils_test.cc @@ -31,15 +31,17 @@ TEST(EagerUtils, AutoGradMeta) { pten::DenseTensorMeta meta = pten::DenseTensorMeta( pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); std::shared_ptr dt0 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt0->mutable_data()[0] = 10.0; EagerTensor et0 = EagerTensor(dt0); std::shared_ptr dt1 = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); dt1->mutable_data()[0] = 20.0; EagerTensor et1 = EagerTensor(dt1); @@ -106,8 +108,9 @@ egr::EagerTensor CreateTestCPUTensor(T val, pten::DenseTensorMeta(pten::DataType::FLOAT32, ddim); egr::EagerTensor tensor; std::shared_ptr dt = std::make_shared( - std::make_shared( - paddle::platform::CPUPlace()), + std::make_unique( + paddle::platform::CPUPlace()) + .get(), meta); auto* dt_ptr = dt->mutable_data(); for (int64_t i = 0; i < dt->numel(); i++) { diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h index a8c05b7651689..acdba822ac4bb 100644 --- a/paddle/pten/api/lib/utils/allocator.h +++ b/paddle/pten/api/lib/utils/allocator.h @@ -22,25 +22,15 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { -class DefaultAllocator : public pten::deprecated::Allocator { +class DefaultAllocator : public pten::Allocator { public: - using Allocation = pten::deprecated::Allocation; explicit DefaultAllocator(const paddle::platform::Place& place) : place_(place) {} - static void Delete(Allocation* allocation) { - paddle::memory::allocation::Allocator::AllocationDeleter( - allocation->CastContextWithoutCheck()); + AllocationPtr Allocate(size_t bytes_size) override { + return memory::Alloc(place_, bytes_size); } - Allocation Allocate(size_t bytes_size) override { - paddle::memory::AllocationPtr a = memory::Alloc(place_, bytes_size); - void* ptr = a->ptr(); - return Allocation(ptr, a.release(), &Delete, place_); - } - - const paddle::platform::Place& place() override { return place_; } - private: paddle::platform::Place place_; }; diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index eb6f834d72779..716e1ac3d30bb 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -24,13 +24,11 @@ limitations under the License. */ namespace pten { -DenseTensor::DenseTensor(const std::shared_ptr& a, - const DenseTensorMeta& meta) +DenseTensor::DenseTensor(Allocator* a, const DenseTensorMeta& meta) : meta_(meta), storage_(make_intrusive(a, SizeOf(dtype()) * numel())) {} -DenseTensor::DenseTensor(const std::shared_ptr& a, - DenseTensorMeta&& meta) +DenseTensor::DenseTensor(Allocator* a, DenseTensorMeta&& meta) : meta_(std::move(meta)), storage_(make_intrusive(a, SizeOf(dtype()) * numel())) {} diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 4f25fc296724c..db8d7a2a39c90 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -60,17 +60,15 @@ class TensorInplaceVersion { class DenseTensor : public TensorBase, public TypeInfoTraits { public: - using Allocator = deprecated::Allocator; - /// \brief Construct a dense tensor and allocate space. /// \param a The allocator used to allocate space. /// \param meta The meta data of dense tensor. - DenseTensor(const std::shared_ptr& a, const DenseTensorMeta& meta); + DenseTensor(Allocator* a, const DenseTensorMeta& meta); /// \brief Construct a dense tensor and allocate space. /// \param a The allocator used to allocate space. /// \param meta The meta data of dense tensor. - DenseTensor(const std::shared_ptr& a, DenseTensorMeta&& meta); + DenseTensor(Allocator* a, DenseTensorMeta&& meta); /// \brief Use existing storage space to create dense tensor. This interface /// can be used to deliberately create an uninitialized dense tensor. 
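(For orientation, a minimal sketch of the new constructor contract introduced above. This block is illustrative only and is not part of the patch: the helper name ctor_sketch and the tensor shape are invented here, template arguments are written out in full, and it assumes the DefaultAllocator defined earlier in this commit.)

#include <memory>

#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/core/dense_tensor.h"

void ctor_sketch() {
  pten::DenseTensorMeta meta(pten::DataType::FLOAT32,
                             paddle::framework::make_ddim({3, 4}),
                             pten::DataLayout::NCHW);

  // After this commit the caller owns the allocator and hands the tensor a
  // raw pointer; the allocator must outlive every tensor built from it.
  auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
  pten::DenseTensor dense_x(alloc.get(), meta);

  // Before this commit the equivalent code shared ownership instead:
  //   auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
  //       paddle::platform::CPUPlace());
  //   pten::DenseTensor dense_x(alloc, meta);

  float* data = dense_x.mutable_data<float>();
  data[0] = 1.0f;
}

(The test updates in the hunks below are exactly this substitution, make_shared replaced by make_unique plus .get(); the design trades per-tensor shared_ptr reference counting for an explicit, caller-managed allocator lifetime.)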
diff --git a/paddle/pten/core/storage.cc b/paddle/pten/core/storage.cc index f7c7f68734101..aacae7be88349 100644 --- a/paddle/pten/core/storage.cc +++ b/paddle/pten/core/storage.cc @@ -18,7 +18,7 @@ namespace pten { void TensorStorage::Realloc(size_t size) { this->Clear(); - data_ = paddle::memory::AllocShared(alloc_->place(), size); + data_ = alloc_->Allocate(size); size_ = size; } diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h index cf18dd913093a..97d7f8d0f1105 100644 --- a/paddle/pten/core/storage.h +++ b/paddle/pten/core/storage.h @@ -91,12 +91,11 @@ class Storage : public intrusive_ref_counter { class TensorStorage : public Storage { public: using Place = paddle::platform::Place; - using Allocator = deprecated::Allocator; - explicit TensorStorage(const std::shared_ptr& a) : alloc_(a) {} + explicit TensorStorage(Allocator* a) : alloc_(a) {} - TensorStorage(const std::shared_ptr& a, size_t size) - : Storage(paddle::memory::AllocShared(a->place(), size)), alloc_(a) { + TensorStorage(Allocator* a, size_t size) + : Storage(a->Allocate(size)), alloc_(a) { size_ = data_->size(); } @@ -114,24 +113,18 @@ class TensorStorage : public Storage { size_t size() const noexcept override { return size_; } const Place& place() const override { - if (!data_ && !alloc_) { + if (!data_) { PADDLE_THROW(paddle::platform::errors::Unimplemented( "Unable to visit place: either data_ or alloc_ has to be initialized " "first.")); } - if (data_) { - return data_->place(); - } - return alloc_->place(); + return data_->place(); } bool OwnsMemory() const noexcept override { return true; } - const std::shared_ptr& allocator() const noexcept { - return alloc_; - } private: - const std::shared_ptr alloc_; + Allocator* alloc_; int64_t size_{0}; }; diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index ffbc551843148..79d9a3d82e69e 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -5,8 +5,6 @@ else() endif() cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest) -cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_api_utils) -cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_api_utils) cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc index c2660a1f80019..6608d1ed08cab 100644 --- a/paddle/pten/tests/api/test_cast_api.cc +++ b/paddle/pten/tests/api/test_cast_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, cast) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_conj_api.cc b/paddle/pten/tests/api/test_conj_api.cc index 928f8e414fda0..50d190257a16d 100644 --- a/paddle/pten/tests/api/test_conj_api.cc +++ b/paddle/pten/tests/api/test_conj_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, conj) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::COMPLEX64, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc index 41c03f8f26201..40e709b960334 100644 --- a/paddle/pten/tests/api/test_dot_api.cc +++ b/paddle/pten/tests/api/test_dot_api.cc @@ -30,17 +30,17 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, dot) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc index e5971aae5513f..69af32eb457a6 100644 --- a/paddle/pten/tests/api/test_elementwise_api.cc +++ b/paddle/pten/tests/api/test_elementwise_api.cc @@ -30,17 +30,17 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, add) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -84,17 +84,17 @@ TEST(API, add) { // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, subtract) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -138,17 +138,17 @@ TEST(API, subtract) { // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, divide) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); @@ -192,17 +192,17 @@ TEST(API, divide) { TEST(API, multiply) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({10}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_empty_api.cc b/paddle/pten/tests/api/test_empty_api.cc index fcc01ad8a7172..f4e3f472c7990 100644 --- a/paddle/pten/tests/api/test_empty_api.cc +++ b/paddle/pten/tests/api/test_empty_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, empty_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -55,11 +55,11 @@ TEST(API, empty_like) { TEST(API, empty1) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_shape = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, framework::make_ddim({2}), pten::DataLayout::NCHW)); @@ -83,11 +83,11 @@ TEST(API, empty1) { } TEST(API, empty2) { - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_scalar = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, framework::make_ddim({1}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc index e87d094eec9d3..0d823765680e8 100644 --- a/paddle/pten/tests/api/test_fill_api.cc +++ b/paddle/pten/tests/api/test_fill_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, full_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -65,10 +65,10 @@ TEST(API, full_like) { TEST(API, zeros_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -98,10 +98,10 @@ TEST(API, zeros_like) { TEST(API, ones_like) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, framework::make_ddim({3, 2}), pten::DataLayout::NCHW)); @@ -131,11 +131,11 @@ TEST(API, ones_like) { TEST(API, full1) { // 1. 
create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_shape = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, framework::make_ddim({2}), pten::DataLayout::NCHW)); @@ -144,7 +144,7 @@ TEST(API, full1) { shape_data[1] = 3; auto dense_scalar = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({1}), pten::DataLayout::NCHW)); @@ -177,11 +177,11 @@ TEST(API, full1) { } TEST(API, full2) { - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_scalar = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT32, framework::make_ddim({1}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc index 93c8a50f02a78..6c082b9653e6f 100644 --- a/paddle/pten/tests/api/test_flatten_api.cc +++ b/paddle/pten/tests/api/test_flatten_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, flatten) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index bef0e2af4cf92..03f686f1c3f5e 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; TEST(API, matmul_cpu) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -41,7 +41,7 @@ TEST(API, matmul_cpu) { auto* dense_x_data = dense_x->mutable_data(); auto dense_y = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -79,10 +79,10 @@ TEST(API, matmul_cpu) { TEST(API, matmul_cuda) { // Prepare CPU Dense Tensor const auto alloc_cpu = - std::make_shared( + std::make_unique( paddle::platform::CPUPlace()); auto ref_x = std::make_shared( - alloc_cpu, + alloc_cpu.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -90,7 +90,7 @@ TEST(API, matmul_cuda) { auto* ref_x_data = ref_x->mutable_data(); auto ref_y = std::make_shared( - alloc_cpu, + alloc_cpu.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -104,16 +104,16 @@ TEST(API, matmul_cuda) { // 1. 
create tensor const auto alloc_cuda = - std::make_shared( + std::make_unique( paddle::platform::CUDAPlace()); auto dense_x = std::make_shared( - alloc_cuda, + alloc_cuda.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); auto dense_y = std::make_shared( - alloc_cuda, + alloc_cuda.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 3}), pten::DataLayout::NCHW)); @@ -143,7 +143,7 @@ TEST(API, matmul_cuda) { auto dense_out = std::dynamic_pointer_cast(out.impl()); auto ref_out = std::make_shared( - alloc_cpu, + alloc_cpu.get(), pten::DenseTensorMeta( pten::DataType::FLOAT32, out.dims(), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc index a8c4c5306dced..9d90e58101cbd 100644 --- a/paddle/pten/tests/api/test_mean_api.cc +++ b/paddle/pten/tests/api/test_mean_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, mean) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index 227dcc6e9568d..59e9e9fab1122 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, reshape) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 2, 2, 3}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_storage.cc b/paddle/pten/tests/api/test_storage.cc deleted file mode 100644 index 1a5d95f9419c5..0000000000000 --- a/paddle/pten/tests/api/test_storage.cc +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "gtest/gtest.h" - -#include "paddle/pten/api/lib/utils/allocator.h" -#include "paddle/pten/api/lib/utils/storage.h" - -namespace paddle { -namespace tests { - -TEST(host_storage, external_stroage) { - const size_t size{100}; - const auto a = std::make_shared( - paddle::platform::CPUPlace()); - pten::intrusive_ptr in_storage = - pten::make_intrusive(a, size); - char* data = static_cast(in_storage->data()); - for (size_t i = 0; i < size; ++i) { - data[i] = i; - } - const size_t delta{1}; - const size_t n{10}; - auto ex_storage = - pten::make_intrusive(in_storage, delta, n); - CHECK_EQ(ex_storage->size(), n); - CHECK(paddle::platform::is_cpu_place(ex_storage->place())); - CHECK(!ex_storage->OwnsMemory()); - for (size_t i = delta; i < delta + n; ++i) { - CHECK_EQ(data[i], static_cast(i)); - } -} - -TEST(host_storage, external_vector) { - std::vector data(100); - for (size_t i = 0; i < data.size(); ++i) { - data[i] = i; - } - const size_t delta{1}; - const size_t n{10}; - auto ex_storage = pten::make_intrusive( - data.data(), n, paddle::platform::CPUPlace()); - CHECK_EQ(ex_storage->size(), n); - CHECK(paddle::platform::is_cpu_place(ex_storage->place())); - CHECK(!ex_storage->OwnsMemory()); - for (size_t i = delta; i < delta + n; ++i) { - CHECK_EQ(data[i], static_cast(i)); - } -} - -} // namespace tests -} // namespace paddle diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc index ff1609d3d4051..5a7c9840e1114 100644 --- a/paddle/pten/tests/api/test_sum_api.cc +++ b/paddle/pten/tests/api/test_sum_api.cc @@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, sum) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/api/test_tensor_utils.cc b/paddle/pten/tests/api/test_tensor_utils.cc deleted file mode 100644 index 041bd28ad892a..0000000000000 --- a/paddle/pten/tests/api/test_tensor_utils.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "gtest/gtest.h" - -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/core/tensor_meta.h" - -namespace paddle { -namespace tests { - -using DDim = paddle::framework::DDim; -using DataType = paddle::experimental::DataType; -using DataLayout = paddle::experimental::DataLayout; - -using DenseTensor = pten::DenseTensor; -using DenseTensorMeta = pten::DenseTensorMeta; - -TEST(tensor_utils, dense_tensor_to_lod_tensor) { - const DDim dims({2, 1}); - const DataType dtype{DataType::FLOAT32}; - const DataLayout layout{DataLayout::NCHW}; - const pten::LoD lod{{0, 2}}; - DenseTensorMeta meta(dtype, dims, layout, lod); - - auto alloc = - std::make_shared(platform::CPUPlace()); - - DenseTensor dense_tensor(alloc, meta); - float* data = dense_tensor.mutable_data(); - data[0] = 1.0f; - data[1] = 2.1f; - - framework::LoDTensor lod_tensor; - experimental::MovesStorage(&dense_tensor, &lod_tensor); - - CHECK(dense_tensor.lod().size() == lod_tensor.lod().size()); - CHECK(dense_tensor.lod()[0] == - static_cast>((lod_tensor.lod()[0]))); - CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(lod_tensor.type())); - CHECK(dense_tensor.layout() == lod_tensor.layout()); - CHECK(platform::is_cpu_place(lod_tensor.place())); - - CHECK(lod_tensor.data()[0] == 1.0f); - CHECK(lod_tensor.data()[1] == 2.1f); - - auto dense_tensor_1 = experimental::MakePtenDenseTensor(lod_tensor); - CHECK(dense_tensor_1->dims() == dims); - CHECK(dense_tensor_1->dtype() == dtype); - CHECK(dense_tensor_1->layout() == layout); - CHECK(dense_tensor_1->lod().size() == lod.size()); - CHECK(dense_tensor_1->lod()[0] == lod[0]); - const float* data_1 = dense_tensor_1->data(); - CHECK(data_1[0] == 1.0f); - CHECK(data_1[1] == 2.1f); -} - -TEST(tensor_utils, dense_tensor_to_tensor) { - const DDim dims({2, 1}); - const DataType dtype{DataType::FLOAT32}; - const DataLayout layout{DataLayout::NCHW}; - DenseTensorMeta meta(dtype, dims, layout); - - auto alloc = - std::make_shared(platform::CPUPlace()); - - DenseTensor dense_tensor(alloc, meta); - float* data = dense_tensor.mutable_data(); - data[0] = 1.0f; - data[1] = 2.1f; - - framework::Tensor tensor; - experimental::MovesStorage(&dense_tensor, &tensor); - - CHECK(dense_tensor.dtype() == pten::TransToPtenDataType(tensor.type())); - CHECK(dense_tensor.layout() == tensor.layout()); - CHECK(platform::is_cpu_place(tensor.place())); - - CHECK(tensor.data()[0] == 1.0f); - CHECK(tensor.data()[1] == 2.1f); - - auto dense_tensor_1 = experimental::MakePtenDenseTensor(tensor); - CHECK(dense_tensor_1->dims() == dims); - CHECK(dense_tensor_1->dtype() == dtype); - CHECK(dense_tensor_1->layout() == layout); - const float* data_1 = dense_tensor_1->data(); - CHECK(data_1[0] == 1.0f); - CHECK(data_1[1] == 2.1f); -} - -TEST(PtenUtils, VarToPtTensor) { - // 1. create Variable - paddle::framework::Variable v; - auto selected_rows = v.GetMutable(); - paddle::framework::Tensor* value = selected_rows->mutable_value(); - auto* data = value->mutable_data(paddle::framework::make_ddim({1, 1}), - paddle::platform::CPUPlace()); - data[0] = 123; - pten::Backend expect_backend = pten::Backend::CPU; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - expect_backend = pten::Backend::GPU; -#endif - auto tensor_def = pten::TensorArgDef( - expect_backend, pten::DataLayout::NCHW, pten::DataType::INT32); - // 2. test API - auto tensor_x = experimental::MakePtenTensorBaseFromVar(v, tensor_def); - // 3. 
check result - ASSERT_EQ(tensor_x->dtype(), pten::DataType::INT32); -} - -} // namespace tests -} // namespace paddle diff --git a/paddle/pten/tests/api/test_to_api.cc b/paddle/pten/tests/api/test_to_api.cc index 47e8ff7c2c87e..9aef716029a69 100644 --- a/paddle/pten/tests/api/test_to_api.cc +++ b/paddle/pten/tests/api/test_to_api.cc @@ -28,10 +28,10 @@ namespace framework = paddle::framework; using DDim = paddle::framework::DDim; paddle::experimental::Tensor CreateInputTensor() { - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); auto dense_x = std::make_shared( - alloc, + alloc.get(), pten::DenseTensorMeta(pten::DataType::INT64, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/core/CMakeLists.txt b/paddle/pten/tests/core/CMakeLists.txt index 9a5cfecc2917b..07554f02d9992 100644 --- a/paddle/pten/tests/core/CMakeLists.txt +++ b/paddle/pten/tests/core/CMakeLists.txt @@ -1,5 +1,3 @@ -cc_test(test_allocator SRCS test_allocator.cc DEPS tensor_base) -cc_test(test_storage SRCS test_storage.cc DEPS tensor_base) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h index c2c74e1aacf1f..e78f288e8e545 100644 --- a/paddle/pten/tests/core/allocator.h +++ b/paddle/pten/tests/core/allocator.h @@ -21,76 +21,19 @@ limitations under the License. */ namespace pten { namespace tests { -class HostAllocatorSample : public pten::deprecated::RawAllocator { +class FancyAllocator : public pten::Allocator { public: - using Place = paddle::platform::Place; - void* Allocate(size_t bytes_size) override { - return ::operator new(bytes_size); - } - void Deallocate(void* ptr, size_t bytes_size) override { - return ::operator delete(ptr); - } - const Place& place() const override { return place_; } - - private: - Place place_{paddle::platform::CPUPlace()}; -}; - -class FancyAllocator : public pten::deprecated::Allocator { - public: - using Allocation = pten::deprecated::Allocation; static void Delete(Allocation* allocation) { ::operator delete(allocation->ptr()); } - Allocation Allocate(size_t bytes_size) override { + AllocationPtr Allocate(size_t bytes_size) override { void* data = ::operator new(bytes_size); - return Allocation(data, data, &Delete, place()); - } - - const paddle::platform::Place& place() override { return place_; } - - paddle::platform::Place place_ = paddle::platform::CPUPlace(); -}; - -template -struct CustomAllocator { - using value_type = T; - using Allocator = pten::deprecated::RawAllocator; - - explicit CustomAllocator(const std::shared_ptr& a) noexcept - : alloc_(a) {} - - CustomAllocator(const CustomAllocator&) noexcept = default; - T* allocate(std::size_t n) { - return static_cast(alloc_->Allocate(n * sizeof(T))); - } - void deallocate(T* p, std::size_t n) { - return alloc_->Deallocate(p, sizeof(T) * n); + auto* allocation = + new pten::Allocation(data, bytes_size, paddle::platform::CPUPlace()); + return AllocationPtr(allocation, Delete); } - - template - friend bool operator==(const CustomAllocator&, - const CustomAllocator&) noexcept; - template - friend bool operator!=(const CustomAllocator&, - const CustomAllocator&) noexcept; - - private: - std::shared_ptr alloc_; }; -template -inline bool operator==(const CustomAllocator& lhs, - const CustomAllocator& rhs) noexcept { - return &lhs.alloc_ == 
&rhs.alloc_; -} - -template -inline bool operator!=(const CustomAllocator& lhs, - const CustomAllocator& rhs) noexcept { - return &lhs.alloc_ != &rhs.alloc_; -} - } // namespace tests } // namespace pten diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc deleted file mode 100644 index 94ba9a1e1b9a2..0000000000000 --- a/paddle/pten/tests/core/test_allocator.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/pten/tests/core/allocator.h" -#include "paddle/pten/tests/core/random.h" -#include "paddle/pten/tests/core/timer.h" - -namespace pten { -namespace tests { - -using RawAllocator = pten::deprecated::RawAllocator; -using Allocator = pten::deprecated::Allocator; -using Allocation = pten::deprecated::Allocation; - -template -bool host_allocator_test(size_t vector_size) { - std::vector src(vector_size); - std::generate(src.begin(), src.end(), make_generator(src)); - std::vector> dst( - src.begin(), - src.end(), - CustomAllocator(std::make_shared())); - return std::equal(src.begin(), src.end(), dst.begin()); -} - -TEST(raw_allocator, host) { - CHECK(host_allocator_test(1000)); - CHECK(host_allocator_test(1000)); - CHECK(host_allocator_test(1000)); -} - -class StorageRawAlloc { - public: - StorageRawAlloc(const std::shared_ptr& a, size_t size) - : alloc_(a) { - data_ = alloc_->Allocate(size); - } - ~StorageRawAlloc() { alloc_->Deallocate(data_, size); } - - private: - void* data_; - size_t size; - std::shared_ptr alloc_; -}; - -class StorageFancyAlloc { - public: - StorageFancyAlloc(const std::shared_ptr& a, size_t size) - : alloc_(a), allocation_(a->Allocate(size)) {} - - private: - std::shared_ptr alloc_; - Allocation allocation_; -}; - -TEST(benchmark, allocator) { - std::shared_ptr raw_allocator(new HostAllocatorSample); - std::shared_ptr fancy_allocator(new FancyAllocator); - const size_t cycles = 100; - Timer timer; - double t1{}, t2{}; - for (size_t i = 0; i < cycles; ++i) { - timer.tic(); - for (size_t i = 0; i < cycles; ++i) { - StorageRawAlloc(raw_allocator, i * 100); - } - t1 += timer.toc(); - timer.tic(); - for (size_t i = 0; i < cycles; ++i) { - StorageFancyAlloc(fancy_allocator, i * 100); - } - t2 += timer.toc(); - } - std::cout << "The cost of raw alloc is " << t1 << "ms.\n"; - std::cout << "The cost of fancy alloc with place is " << t2 << "ms.\n"; -} - -} // namespace tests -} // namespace pten diff --git a/paddle/pten/tests/core/test_dense_tensor.cc b/paddle/pten/tests/core/test_dense_tensor.cc index 8277c0d8dadb7..8564969796c7e 100644 --- a/paddle/pten/tests/core/test_dense_tensor.cc +++ b/paddle/pten/tests/core/test_dense_tensor.cc @@ -75,7 +75,8 @@ TEST(dense_tensor, ctor) { const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); - auto alloc = std::make_shared(); + auto fancy_allocator = std::unique_ptr(new FancyAllocator); + auto* 
alloc = fancy_allocator.get(); auto check_dense_tensor = [](const DenseTensor& t, const DenseTensorMeta& m) -> bool { @@ -95,10 +96,6 @@ TEST(dense_tensor, ctor) { DenseTensor tensor_1(alloc, DenseTensorMeta(meta)); check_dense_tensor(tensor_0, meta); - - DenseTensor tensor_2(make_intrusive(alloc), meta); - CHECK_NOTNULL(tensor_2.mutable_data()); - check_dense_tensor(tensor_2, meta); } TEST(dense_tensor, resize) { @@ -108,7 +105,8 @@ TEST(dense_tensor, resize) { const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); - auto alloc = std::make_shared(); + auto fancy_allocator = std::unique_ptr(new FancyAllocator); + auto* alloc = fancy_allocator.get(); DenseTensor tensor_0(alloc, meta); CHECK_EQ(tensor_0.capacity(), 2u); @@ -125,7 +123,8 @@ TEST(dense_tensor, shallow_copy) { const LoD lod{}; DenseTensorMeta meta(dtype, dims, layout, lod); - auto alloc = std::make_shared(); + auto fancy_allocator = std::unique_ptr(new FancyAllocator); + auto* alloc = fancy_allocator.get(); DenseTensor tensor_0(alloc, meta); DenseTensor tensor_1(tensor_0); diff --git a/paddle/pten/tests/core/test_storage.cc b/paddle/pten/tests/core/test_storage.cc deleted file mode 100644 index 69d1eae668c58..0000000000000 --- a/paddle/pten/tests/core/test_storage.cc +++ /dev/null @@ -1,40 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "gtest/gtest.h" - -#include "paddle/pten/core/storage.h" -#include "paddle/pten/tests/core/allocator.h" - -namespace pten { -namespace tests { - -TEST(host_storage, internal) { - // TODO(Shixiaowei02): Here we need to consider the case - // where the size is zero. - const size_t size{100}; - const auto a = std::make_shared(); - TensorStorage storage(a, size); - CHECK_EQ(storage.size(), size); - CHECK(paddle::platform::is_cpu_place(storage.place())); - CHECK(storage.OwnsMemory()); - CHECK(storage.allocator() == a); - storage.Realloc(size + 100); - CHECK_EQ(storage.size(), size + 100); -} - -} // namespace tests -} // namespace pten diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index cb45d827e3be9..90624adeb344e 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -31,9 +31,9 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, cast) { // 1. create tensor - const auto alloc = std::make_shared( + const auto alloc = std::make_unique( paddle::platform::CPUPlace()); - pten::DenseTensor dense_x(alloc, + pten::DenseTensor dense_x(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 4}), pten::DataLayout::NCHW)); diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index 3392626dc2ad3..789d55491f368 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim; TEST(DEV_API, conj) { // 1. 
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::COMPLEX64,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc
index 3095c83d97c98..c4d8c37eb9e0f 100644
--- a/paddle/pten/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc
@@ -31,17 +31,17 @@ using DDim = paddle::framework::DDim;
 // in 'paddle/api'
 TEST(DEV_API, copy) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());
   auto dense_src = std::make_shared<pten::DenseTensor>(
-      alloc,
+      alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({2, 3}),
                             pten::DataLayout::NCHW));
   auto* dense_x_data = dense_src->mutable_data<float>();
 
   auto dense_dst = std::make_shared<pten::DenseTensor>(
-      alloc,
+      alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({2, 3}),
                             pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc
index 4d753f7d09b8e..169a77cf3436b 100644
--- a/paddle/pten/tests/kernels/test_creation_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc
@@ -50,9 +50,9 @@ TEST(DEV_API, empty) {
 
 TEST(DEV_API, empty_like) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 2}),
                                                   pten::DataLayout::NCHW));
@@ -105,9 +105,9 @@ TEST(DEV_API, full) {
 
 TEST(DEV_API, full_like) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 2}),
                                                   pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc
index 6e2166cb673bd..a5773b8aa9690 100644
--- a/paddle/pten/tests/kernels/test_dot_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc
@@ -29,15 +29,15 @@ using DDim = paddle::framework::DDim;
 
 TEST(DEV_API, dot) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x.mutable_data<float>();
 
-  pten::DenseTensor dense_y(alloc,
+  pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index bd09ecb770a5d..40998a8d57caa 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -29,15 +29,15 @@ using DDim = paddle::framework::DDim;
 
 TEST(DEV_API, add) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x.mutable_data<float>();
 
-  pten::DenseTensor dense_y(alloc,
+  pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
@@ -82,15 +82,15 @@ TEST(DEV_API, add) {
 
 TEST(DEV_API, subtract) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x.mutable_data<float>();
 
-  pten::DenseTensor dense_y(alloc,
+  pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
@@ -135,15 +135,15 @@ TEST(DEV_API, subtract) {
 
 TEST(DEV_API, divide) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x.mutable_data<float>();
 
-  pten::DenseTensor dense_y(alloc,
+  pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
@@ -188,15 +188,15 @@ TEST(DEV_API, divide) {
 
 TEST(DEV_API, multiply) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x.mutable_data<float>();
 
-  pten::DenseTensor dense_y(alloc,
+  pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({10}),
                                                   pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
index f18e5c050ba70..d66ff468fcf48 100644
--- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
@@ -39,10 +39,10 @@ using DDim = paddle::framework::DDim;
 
 TEST(DEV_API, flatten) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
   pten::DenseTensor dense_x(
-      alloc,
+      alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2, 2, 3}),
                             pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
index 7ac3d19554581..0c1338f195563 100644
--- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
@@ -29,16 +29,16 @@ using DDim = paddle::framework::DDim;
 
 TEST(DEV_API, dot) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  DenseTensor dense_x(alloc,
+  DenseTensor dense_x(alloc.get(),
                       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                             framework::make_ddim({3, 3}),
                                             pten::DataLayout::NCHW));
 
   auto* dense_x_data = dense_x.mutable_data<float>();
 
-  DenseTensor dense_y(alloc,
+  DenseTensor dense_y(alloc.get(),
                       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                             framework::make_ddim({3, 3}),
                                             pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc
index 4b254e7e6c1ac..98782fd5dae0b 100644
--- a/paddle/pten/tests/kernels/test_mean_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc
@@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim;
 
 TEST(DEV_API, mean) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
index 0196e1c211004..02139d02de17e 100644
--- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
@@ -30,10 +30,10 @@ using DDim = paddle::framework::DDim;
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(DEV_API, reshape) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
   pten::DenseTensor dense_x(
-      alloc,
+      alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 2, 2, 3}),
                             pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc
index fe26f56552b05..02f324deb4cec 100644
--- a/paddle/pten/tests/kernels/test_scale_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim;
 
 TEST(DEV_API, scale) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
@@ -69,9 +69,9 @@ TEST(DEV_API, scale) {
 
 TEST(DEV_API, scale_host) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
@@ -79,9 +79,8 @@ TEST(DEV_API, scale_host) {
   for (size_t i = 0; i < 12; ++i) {
     dense_x_data[i] = i * 1.0;
   }
-  const auto alloc2 = std::make_shared<paddle::experimental::DefaultAllocator>(
-      paddle::platform::CPUPlace());
-  pten::DenseTensor scale(alloc2,
+
+  pten::DenseTensor scale(alloc.get(),
                           pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                 framework::make_ddim({1}),
                                                 pten::DataLayout::NCHW));
diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc
index afaf903063781..312a6ce6100bb 100644
--- a/paddle/pten/tests/kernels/test_sum_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc
@@ -29,9 +29,9 @@ using DDim = paddle::framework::DDim;
 
 TEST(DEV_API, sum) {
   // 1. create tensor
-  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
-  pten::DenseTensor dense_x(alloc,
+  pten::DenseTensor dense_x(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 4}),
                                                   pten::DataLayout::NCHW));
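Note on the pattern above: every kernel test now keeps the only owning reference to the allocator (std::make_unique instead of std::make_shared) and hands the tensor a raw, non-owning pointer via alloc.get(). The following is a minimal self-contained sketch of that ownership convention; Allocator, HostAllocator, and Tensor are hypothetical stand-ins for the pten types, not the real API.

#include <cstddef>
#include <iostream>
#include <memory>

// Stand-in for an abstract allocator interface (illustrative only).
class Allocator {
 public:
  virtual ~Allocator() = default;
  virtual void* Allocate(std::size_t bytes) = 0;
};

// Stand-in for a concrete host allocator.
class HostAllocator : public Allocator {
 public:
  void* Allocate(std::size_t bytes) override { return ::operator new(bytes); }
};

// Stand-in for DenseTensor: it borrows the allocator and never owns it,
// so the allocator must outlive every tensor constructed from it.
class Tensor {
 public:
  explicit Tensor(Allocator* a) : alloc_(a) {}
  void* Reserve(std::size_t bytes) { return alloc_->Allocate(bytes); }

 private:
  Allocator* alloc_;  // non-owning
};

int main() {
  // The enclosing scope owns the allocator ...
  const auto alloc = std::make_unique<HostAllocator>();
  // ... and each tensor receives a raw pointer, mirroring alloc.get() above.
  Tensor t(alloc.get());
  void* buffer = t.Reserve(64);
  std::cout << std::boolalpha << (buffer != nullptr) << std::endl;
  ::operator delete(buffer);
  return 0;
}

The practical effect of the migration is visible in the sketch: constructing a tensor no longer bumps a shared_ptr reference count, and the allocator's lifetime is guaranteed lexically by the enclosing test scope rather than by reference counting.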