From e5cda6faa08b884e6bf5ae8b13da7691a364d5e6 Mon Sep 17 00:00:00 2001
From: Yulong Ao <aoyulong@baidu.com>
Date: Fri, 21 Jan 2022 11:56:32 +0800
Subject: [PATCH 01/15] [Auto Parallel] Use the new completion algorithm
 (#39086)

* Add the backward support for QR

* Remove unnecessary comments

* [Auto Parallel] Improve the dist op interface and compatible computation

* Remove unnecessary modification

* Recover some modifications

* Add lost files

* Fix a minor bug

* Fix the bug of the planner

* Fix the format problem

* [Auto Parallel] Update the completion algorithm

* Fix the bug of auto_searcher unittest
---
 .../distributed/auto_parallel/__init__.py     |    6 -
 .../distributed/auto_parallel/completion.py   | 1414 +++++++----------
 .../distributed/auto_parallel/dist_context.py |   34 +-
 .../distributed/auto_parallel/parallelizer.py |   15 +-
 .../test_auto_parallel_completion.py          |   66 +-
 .../test_auto_parallel_completion_gpt.py      |   22 +-
 .../test_auto_parallel_cost_model.py          |    6 +-
 .../test_auto_parallel_dist_tensor.py         |    6 +-
 .../unittests/test_auto_parallel_mapper.py    |   12 +-
 .../test_auto_parallel_partitioner.py         |    6 +-
 .../test_auto_parallel_partitioner_gpt.py     |   11 +-
 .../unittests/test_auto_parallel_reshard.py   |    7 +-
 .../test_auto_parallel_reshard_dpmppp.py      |    6 +-
 .../test_auto_parallel_reshard_mppp.py        |   11 +-
 .../unittests/test_auto_parallel_searcher.py  |    4 +-
 15 files changed, 686 insertions(+), 940 deletions(-)

diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py
index 3b5ccaa062f6e..edcd53bdc7a52 100644
--- a/python/paddle/distributed/auto_parallel/__init__.py
+++ b/python/paddle/distributed/auto_parallel/__init__.py
@@ -15,12 +15,6 @@
 from .interface import shard_tensor  # noqa: F401
 from .interface import shard_op  # noqa: F401
 from .process_mesh import ProcessMesh
-# from .interface import set_shard_mask  # noqa: F401
-# from .interface import set_offload_device  # noqa: F401
-# from .interface import set_pipeline_stage  # noqa: F401
-# from .interface import ProcessMesh  # noqa: F401
-from .completion import complete_annotation  # noqa: F401
-from .completion import complete_backward_annotation  # noqa: F401
 from .reshard import reshard  # noqa: F401
 from .cost_model import estimate_cost
 
diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py
index 660b1a54221a7..54491f9e6c16e 100644
--- a/python/paddle/distributed/auto_parallel/completion.py
+++ b/python/paddle/distributed/auto_parallel/completion.py
@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 from copy import deepcopy
+import time
 
 from paddle.fluid import core
 from paddle.fluid import framework
 
-from .utils import compute_compatible_process_mesh
-from .utils import compute_compatible_dim_mapping
-from .utils import compute_compatible_dims_mapping
 from .utils import print_program_with_dist_attr
 from .operators import find_best_compatible_distributed_operator_impl
 from .dist_context import get_default_distributed_context
@@ -29,865 +28,602 @@
 from .dist_attribute import OperatorDistributedAttribute
 from paddle.distributed.fleet.meta_optimizers.common import OpRole
 
-ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"]
 
+def compute_compatible_process_mesh(process_mesh_list):
+    """Compute the compatible process mesh given a list of process meshes."""
+    if not process_mesh_list:
+        return None
 
-def is_elementwise_like_op(op_type):
-    if op_type in ELEMENTWISE_LIKE_OP_LIST:
-        return True
-    else:
-        return False
-
+    def _compute_compatible_process_mesh_two(pm1, pm2):
+        if pm1 is None:
+            return True, pm2
+        if pm2 is None:
+            return True, pm1
+        if pm1 == pm2:
+            return True, pm1
+        if pm1.processes == pm2.processes:
+            if len(pm1.topology) >= len(pm2.topology):
+                return True, pm1
+            else:
+                return True, pm2
+        process_set1 = set(pm1.processes)
+        process_set2 = set(pm2.processes)
+        if process_set1.issubset(process_set2):
+            return True, pm2
+        if process_set2.issubset(process_set1):
+            return True, pm1
+        return False, None
+
+    compatible_result = None
+    for process_mesh in process_mesh_list:
+        compatible, compatible_result = _compute_compatible_process_mesh_two(
+            compatible_result, process_mesh)
+        if not compatible:
+            return None
+    return copy.deepcopy(compatible_result)
+
+
+def compute_compatible_dim_mapping(dim_mapping_list):
+    """Compute the compatible dim mapping given a list of dim mappings."""
+    if not dim_mapping_list:
+        return None
 
-def update_tensor_node_process_mesh(dist_context, tensor_node, fwd=True):
-    """
-    Update tensor's process mesh by using its predecessor's process mesh if in the forward direction, 
-    and by using its successor's process mesh if in the backward direction. Note: only the equal 
-    process meshes are compatible for now.
+    def _compute_compatible_dim_mapping_two(dm1, dm2):
+        if dm1 == -1:
+            return True, dm2
+        if dm2 == -1:
+            return True, dm1
+        if dm1 == dm2:
+            return True, dm1
+        return False, None
+
+    compatible_result = -1
+    for mapping in dim_mapping_list:
+        compatible, compatible_result = _compute_compatible_dim_mapping_two(
+            compatible_result, mapping)
+        if not compatible:
+            return None
+    return compatible_result
+
+
+def compute_compatible_dims_mapping(dims_mapping_list):
+    """Compute the compatible dims mapping given a list of dims mappings.
+       Each dims mapping is itself a list.
     """
-    changed = False
-    tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(tensor_node)
-    if tensor_dist_attr.is_annotated("process_mesh"):
-        return changed
-    tensor_process_mesh = tensor_dist_attr.process_mesh
-    if fwd:
-        inputs_process_meshes = []
-        for pred_op_node in tensor_node.inputs:
-            if pred_op_node.op() is not None:
-                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
-                    pred_op_node)
-                op_process_mesh = op_dist_attr.process_mesh
-                inputs_process_meshes.append(op_process_mesh)
-        compatible_process_mesh = compute_compatible_process_mesh(
-            inputs_process_meshes)
-        if compatible_process_mesh is not None and tensor_process_mesh is None:
-            tensor_dist_attr.process_mesh = compatible_process_mesh
-            changed = True
-    else:
-        outputs_process_meshes = []
-        for succ_op_node in tensor_node.outputs:
-            if succ_op_node.op() is not None:
-                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
-                    succ_op_node)
-                op_process_mesh = op_dist_attr.process_mesh
-                outputs_process_meshes.append(op_process_mesh)
-        compatible_process_mesh = compute_compatible_process_mesh(
-            outputs_process_meshes)
-        if compatible_process_mesh is not None and tensor_process_mesh is None:
-            tensor_dist_attr.process_mesh = compatible_process_mesh
-            changed = True
-    return changed
-
-
-def update_op_node_process_mesh(dist_context, op_node, fwd=True):
-    """
-    Update op's process mesh by using its predecessor's process mesh if in the forward direction, 
-    and by using its successor's process mesh if in the backward direction. Note: only the equal 
-    process meshes are compatible for now.
-    """
-    changed = False
-    op_dist_attr = dist_context.get_op_dist_attr_for_graph(op_node)
-    if op_dist_attr.is_annotated("process_mesh"):
-        return changed
-    op_process_mesh = op_dist_attr.process_mesh
-    if fwd:
-        inputs_process_meshes = []
-        for tensor_node in op_node.inputs:
-            if tensor_node.var() is not None:
-                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
-                    tensor_node)
-                tensor_process_mesh = tensor_dist_attr.process_mesh
-                inputs_process_meshes.append(tensor_process_mesh)
-        compatible_process_mesh = compute_compatible_process_mesh(
-            inputs_process_meshes)
-        if compatible_process_mesh is not None and op_process_mesh is None:
-            op_dist_attr.process_mesh = compatible_process_mesh
-            changed = True
-    else:
-        outputs_process_meshes = []
-        for tensor_node in op_node.outputs:
-            if tensor_node.var() is not None:
-                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
-                    tensor_node)
-                tensor_process_mesh = tensor_dist_attr.process_mesh
-                outputs_process_meshes.append(tensor_process_mesh)
-        compatible_process_mesh = compute_compatible_process_mesh(
-            outputs_process_meshes)
-        if compatible_process_mesh is not None and op_process_mesh is None:
-            op_dist_attr.process_mesh = compatible_process_mesh
-            changed = True
-    return changed
-
-
-def update_op_dims_mapping_by_default_dist_impl(dist_context, op_node):
-    """Each operator has a default distributed operator, only allowed to be sharded in batch dimension."""
-    changed = False
-    if (not op_node.is_op()) or (op_node.op() is None):
-        return False
-    op_desc = op_node.op()
-    dist_op = dist_context.get_dist_op_for_graph(op_node)
-    op_dist_attr = dist_op.dist_attr
-    # The following statement will be replaced by a more elegent way
-    if op_desc.type() == "shape" or op_desc.type() == "slice":
-        return False
-    output_names = op_desc.output_names()
-    xshape_arg_names = []
-    if "XShape" in output_names:
-        xshape_arg_names = op_desc.output("XShape")
-    batch_dim_mappings = []
-    for arg_name in op_desc.input_arg_names():
-        serial_tensor = dist_op.get_serial_input(arg_name)
-        if serial_tensor.is_parameter:
-            continue
-        dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
-        if len(dims_mapping) > 1:
-            for idx, mapping in enumerate(dims_mapping[1:]):
-                assert mapping == -1, \
-                    "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part."\
-                        .format(op_desc.type(), idx, mapping)
-        batch_dim_mappings.append(dims_mapping[0])
-    for arg_name in op_desc.output_arg_names():
-        serial_tensor = dist_op.get_serial_output(arg_name)
-        if serial_tensor.is_parameter:
-            continue
-        dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
-        if arg_name not in xshape_arg_names:
-            if len(dims_mapping) > 1:
-                for idx, mapping in enumerate(dims_mapping[1:]):
-                    assert mapping == -1, \
-                        "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part."\
-                            .format(op_desc.type(), idx, mapping)
-            batch_dim_mappings.append(dims_mapping[0])
-        else:
-            assert dims_mapping[0] == -1, \
-                "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension 0 is sharded by {} part."\
-                    .format(op_desc.type(), mapping)
-            if len(dims_mapping) > 2:
-                for idx, mapping in enumerate(dims_mapping[2:]):
-                    assert mapping == -1, \
-                        "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension {} is sharded by {} part."\
-                            .format(op_desc.type(), idx, mapping)
-            batch_dim_mappings.append(dims_mapping[1])
-
-    compatible_dim_mapping = compute_compatible_dim_mapping(batch_dim_mappings)
-    assert compatible_dim_mapping is not None, "There is no compatible dim mapping."
-    for arg_name in op_desc.input_arg_names():
-        serial_tensor = dist_op.get_serial_input(arg_name)
-        if serial_tensor.is_parameter:
-            continue
-        dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
-        if compatible_dim_mapping != dims_mapping[0]:
-            dims_mapping[0] = compatible_dim_mapping
-            changed = True
-    for arg_name in op_desc.output_arg_names():
-        serial_tensor = dist_op.get_serial_output(arg_name)
-        if serial_tensor.is_parameter:
-            continue
-        dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
-        if arg_name not in xshape_arg_names:
-            if compatible_dim_mapping != dims_mapping[0]:
-                dims_mapping[0] = compatible_dim_mapping
+    if not dims_mapping_list:
+        return None
+    length = len(dims_mapping_list[0])
+    for dims_mapping in dims_mapping_list:
+        if dims_mapping is None:
+            return None
+        if len(dims_mapping) != length:
+            return None
+    compatible_result = []
+    for dim_mappings in zip(*dims_mapping_list):
+        compatible_dim_mapping = compute_compatible_dim_mapping(
+            list(dim_mappings))
+        if compatible_dim_mapping is None:
+            return None
+        compatible_result.append(compatible_dim_mapping)
+    return compatible_result
+
+
+class Completer:
+    def __init__(self, dist_context):
+        assert dist_context is not None
+        self._dist_context = dist_context
+
+    def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True):
+        changed = False
+        if (not tensor_node.is_var()) or (tensor_node.var() is None):
+            return False
+        tensor_desc = tensor_node.var()
+        # Skip reader tensor
+        if tensor_desc.type() == core.VarDesc.VarType.READER:
+            return False
+        tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
+            tensor_node)
+        assert tensor_dist_attr is not None
+        if tensor_dist_attr.is_annotated("dims_mapping"):
+            return False
+        tensor_dims_mapping = tensor_dist_attr.dims_mapping
+        if fwd:
+            dims_mapping_list = []
+            for pred_op_node in tensor_node.inputs:
+                if pred_op_node.op() is not None:
+                    if pred_op_node.op().type() == "create_py_reader" \
+                        or pred_op_node.op().type() == "create_double_buffer_reader" \
+                        or pred_op_node.op().type() == "read":
+                        continue
+                    op_dist_attr = self._dist_context.get_op_dist_attr_for_graph(
+                        pred_op_node)
+                    if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh:
+                        op_dims_mapping = op_dist_attr.get_output_dims_mapping(
+                            tensor_desc.name())
+                        dims_mapping_list.append(op_dims_mapping)
+            dims_mapping_list.append(tensor_dims_mapping)
+            compatible_dims_mapping = compute_compatible_dims_mapping(
+                dims_mapping_list)
+            if (compatible_dims_mapping is not None) and \
+                (compatible_dims_mapping != tensor_dims_mapping):
+                tensor_dist_attr.dims_mapping = compatible_dims_mapping
                 changed = True
         else:
-            if compatible_dim_mapping != dims_mapping[1]:
-                dims_mapping[1] = compatible_dim_mapping
+            dims_mapping_list = []
+            for succ_op_node in tensor_node.outputs:
+                if succ_op_node.op() is not None:
+                    if succ_op_node.op().type() == "create_py_reader" \
+                        or succ_op_node.op().type() == "create_double_buffer_reader" \
+                        or succ_op_node.op().type() == "read":
+                        continue
+                    op_dist_attr = self._dist_context.get_op_dist_attr_for_graph(
+                        succ_op_node)
+                    if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh:
+                        op_dims_mapping = op_dist_attr.get_input_dims_mapping(
+                            tensor_desc.name())
+                        dims_mapping_list.append(op_dims_mapping)
+            dims_mapping_list.append(tensor_dims_mapping)
+            compatible_dims_mapping = compute_compatible_dims_mapping(
+                dims_mapping_list)
+            if (compatible_dims_mapping is not None) and \
+                (compatible_dims_mapping != tensor_dims_mapping):
+                tensor_dist_attr.dims_mapping = compatible_dims_mapping
                 changed = True
+        return changed
 
-    return changed
-
-
-def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_context, op_node):
-    """Element-wise operator can be sharded in any way (but should take care of broadcasting)."""
-    changed = False
-    if (not op_node.is_op()) or (op_node.op() is None):
-        return False
-    op_desc = op_node.op()
-    op_dist_attr = dist_context.get_op_dist_attr_for_graph(op_node)
-
-    input_arg_names = op_desc.input_arg_names()
-    input_dims_mapping_dict = {}
-    input_dims_mapping_lens = {}
-    max_dims_mapping_len = -1
-    for arg_name in input_arg_names:
-        dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name)
-        if max_dims_mapping_len < len(dims_mapping):
-            max_dims_mapping_len = len(dims_mapping)
-        input_dims_mapping_dict[arg_name] = dims_mapping
-        input_dims_mapping_lens[arg_name] = len(dims_mapping)
-
-    dims_mapping_list = []
-    for arg_name in input_arg_names:
-        if input_dims_mapping_lens[arg_name] < max_dims_mapping_len:
-            new_dims_mapping = [-1 for _ in range(max_dims_mapping_len)]
-            for i in range(input_dims_mapping_lens[arg_name]):
-                new_idx = (max_dims_mapping_len -
-                           input_dims_mapping_lens[arg_name]) + i
-                new_dims_mapping[new_idx] = input_dims_mapping_dict[arg_name][i]
-            dims_mapping_list.append(new_dims_mapping)
-        else:
-            dims_mapping_list.append(input_dims_mapping_dict[arg_name])
-    output_arg_names = op_desc.output_arg_names()
-    for arg_name in output_arg_names:
-        dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
-        assert len(dims_mapping) == max_dims_mapping_len
-        dims_mapping_list.append(dims_mapping)
-
-    compatible_dims_mapping = compute_compatible_dims_mapping(dims_mapping_list)
-    assert compatible_dims_mapping is not None, "There is no compatible dim mapping."
-
-    for arg_name in input_arg_names:
-        if input_dims_mapping_lens[arg_name] < max_dims_mapping_len:
-            new_dims_mapping = [
-                -1 for _ in range(input_dims_mapping_lens[arg_name])
-            ]
-            for i in range(input_dims_mapping_lens[arg_name]):
-                new_idx = (max_dims_mapping_len -
-                           input_dims_mapping_lens[arg_name]) + i
-                new_dims_mapping[i] = compatible_dims_mapping[new_idx]
-            if new_dims_mapping != input_dims_mapping_dict[arg_name]:
-                op_dist_attr.set_input_dims_mapping(arg_name, new_dims_mapping)
+    def _update_op_node_dims_mapping(self, op_node, fwd=True):
+        changed = False
+        if (not op_node.is_op()) or (op_node.op() is None):
+            return False
+        # Skip reader op
+        op_desc = op_node.op()
+        if op_desc.type() == "create_py_reader" \
+            or op_desc.type() == "create_double_buffer_reader" \
+            or op_desc.type() == "read":
+            return False
+        dist_op = self._dist_context.get_dist_op_for_graph(op_node)
+        op_dist_attr = dist_op.dist_attr
+        if fwd:
+            for tensor_node in op_node.inputs:
+                if tensor_node.var() is not None:
+                    if tensor_node.var().type() == core.VarDesc.VarType.READER:
+                        continue
+                    tensor_desc = tensor_node.var()
+                    if op_dist_attr.is_annotated_input_dims_mapping(
+                            tensor_desc.name()):
+                        continue
+                    tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
+                        tensor_node)
+                    if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh:
+                        tensor_dims_mapping = tensor_dist_attr.dims_mapping
+                        op_dims_mapping = op_dist_attr.get_input_dims_mapping(
+                            tensor_desc.name())
+                        compatible_dims_mapping = compute_compatible_dims_mapping(
+                            [op_dims_mapping, tensor_dims_mapping])
+                        if (compatible_dims_mapping is not None) and \
+                            (compatible_dims_mapping != op_dims_mapping):
+                            op_dist_attr.set_input_dims_mapping(
+                                tensor_desc.name(), compatible_dims_mapping)
+                            changed = True
+            # Find the most compatible implementation of the distributed operator
+            op_dist_impl = find_best_compatible_distributed_operator_impl(
+                dist_op, fwd=True)
+            assert op_dist_impl is not None, "Cannot find the dist op implementation."
+            dim_changed = op_dist_impl.update_dims_mapping(dist_op)
+            if dim_changed:
                 changed = True
+            if op_dist_impl.is_auto_compatible(dist_op):
+                if op_dist_impl.type == "elementwise":
+                    op_dist_attr.impl_type = "default"
+                else:
+                    op_dist_attr.impl_type = op_dist_impl.type
+                op_dist_attr.impl_idx = op_dist_impl.idx
         else:
-            if compatible_dims_mapping != input_dims_mapping_dict[arg_name]:
-                op_dist_attr.set_input_dims_mapping(arg_name,
-                                                    compatible_dims_mapping)
+            for tensor_node in op_node.outputs:
+                if tensor_node.var() is not None:
+                    if tensor_node.var().type() == core.VarDesc.VarType.READER:
+                        continue
+                    tensor_desc = tensor_node.var()
+                    if op_dist_attr.is_annotated_output_dims_mapping(
+                            tensor_desc.name()):
+                        continue
+                    tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph(
+                        tensor_node)
+                    if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh:
+                        tensor_dims_mapping = tensor_dist_attr.dims_mapping
+                        op_dims_mapping = op_dist_attr.get_output_dims_mapping(
+                            tensor_desc.name())
+                        compatible_dims_mapping = compute_compatible_dims_mapping(
+                            [op_dims_mapping, tensor_dims_mapping])
+                        if (compatible_dims_mapping is not None) and \
+                            (compatible_dims_mapping != op_dims_mapping):
+                            op_dist_attr.set_output_dims_mapping(
+                                tensor_desc.name(), compatible_dims_mapping)
+                            changed = True
+            # Find the most compatible implementation of the distributed operator
+            op_dist_impl = find_best_compatible_distributed_operator_impl(
+                dist_op, fwd=False)
+            assert op_dist_impl is not None, "Cannot find the dist op implementation."
+            dim_changed = op_dist_impl.update_dims_mapping(dist_op)
+            if dim_changed:
                 changed = True
+            if op_dist_impl.is_auto_compatible(dist_op):
+                if op_dist_impl.type == "elementwise":
+                    op_dist_attr.impl_type = "default"
+                else:
+                    op_dist_attr.impl_type = op_dist_impl.type
+                op_dist_attr.impl_idx = op_dist_impl.idx
+        return changed
 
-    for arg_name in output_arg_names:
-        dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name)
-        if compatible_dims_mapping != dims_mapping:
-            op_dist_attr.set_output_dims_mapping(arg_name,
-                                                 compatible_dims_mapping)
-            changed = True
-
-    return changed
-
-
-def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
-    changed = False
-    if (not tensor_node.is_var()) or (tensor_node.var() is None):
-        return False
-    tensor_desc = tensor_node.var()
-    # Skip reader tensor
-    if tensor_desc.type() == core.VarDesc.VarType.READER:
-        return False
-    tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(tensor_node)
-    assert tensor_dist_attr is not None
-    if tensor_dist_attr.is_annotated("dims_mapping"):
-        return False
-    tensor_dims_mapping = tensor_dist_attr.dims_mapping
-    if fwd:
-        dims_mapping_list = []
-        for pred_op_node in tensor_node.inputs:
-            if pred_op_node.op() is not None:
-                if pred_op_node.op().type() == "create_py_reader" \
-                    or pred_op_node.op().type() == "create_double_buffer_reader" \
-                    or pred_op_node.op().type() == "read":
-                    continue
-                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
-                    pred_op_node)
-                op_dims_mapping = op_dist_attr.get_output_dims_mapping(
-                    tensor_desc.name())
-                dims_mapping_list.append(op_dims_mapping)
-        dims_mapping_list.append(tensor_dims_mapping)
-        compatible_dims_mapping = compute_compatible_dims_mapping(
-            dims_mapping_list)
-        if (compatible_dims_mapping is not None) and \
-            (compatible_dims_mapping != tensor_dims_mapping):
-            tensor_dist_attr.dims_mapping = compatible_dims_mapping
-            changed = True
-    else:
-        dims_mapping_list = []
-        for succ_op_node in tensor_node.outputs:
-            if succ_op_node.op() is not None:
-                if succ_op_node.op().type() == "create_py_reader" \
-                    or succ_op_node.op().type() == "create_double_buffer_reader" \
-                    or succ_op_node.op().type() == "read":
-                    continue
-                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
-                    succ_op_node)
-                op_dims_mapping = op_dist_attr.get_input_dims_mapping(
-                    tensor_desc.name())
-                dims_mapping_list.append(op_dims_mapping)
-        dims_mapping_list.append(tensor_dims_mapping)
-        compatible_dims_mapping = compute_compatible_dims_mapping(
-            dims_mapping_list)
-        if (compatible_dims_mapping is not None) and \
-            (compatible_dims_mapping != tensor_dims_mapping):
-            tensor_dist_attr.dims_mapping = compatible_dims_mapping
-            changed = True
-    return changed
-
-
-def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
-    changed = False
-    if (not op_node.is_op()) or (op_node.op() is None):
-        return False
-    # Skip reader op
-    op_desc = op_node.op()
-    if op_desc.type() == "create_py_reader" \
-        or op_desc.type() == "create_double_buffer_reader" \
-        or op_desc.type() == "read":
-        return False
-    dist_op = dist_context.get_dist_op_for_graph(op_node)
-    op_dist_attr = dist_op.dist_attr
-    if fwd:
-        for tensor_node in op_node.inputs:
-            if tensor_node.var() is not None:
-                if tensor_node.var().type() == core.VarDesc.VarType.READER:
-                    continue
-                tensor_desc = tensor_node.var()
-                if op_dist_attr.is_annotated_input_dims_mapping(
-                        tensor_desc.name()):
-                    continue
-                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
-                    tensor_node)
-                tensor_dims_mapping = tensor_dist_attr.dims_mapping
-                op_dims_mapping = op_dist_attr.get_input_dims_mapping(
-                    tensor_desc.name())
-                compatible_dims_mapping = compute_compatible_dims_mapping(
-                    [op_dims_mapping, tensor_dims_mapping])
-                if (compatible_dims_mapping is not None) and \
-                    (compatible_dims_mapping != op_dims_mapping):
-                    op_dist_attr.set_input_dims_mapping(tensor_desc.name(),
-                                                        compatible_dims_mapping)
-                    changed = True
-        # Find the most compatible implemenetations from the distributed operator
-        op_dist_impl = find_best_compatible_distributed_operator_impl(
-            dist_op, fwd=True)
-        assert op_dist_impl is not None, "Cannot find the dist op implementation."
-        dim_changed = op_dist_impl.update_dims_mapping(dist_op)
-        if dim_changed:
-            changed = True
-        if op_dist_impl.is_auto_compatible(dist_op):
-            if op_dist_impl.type == "elementwise":
-                op_dist_attr.impl_type = "default"
-            else:
-                op_dist_attr.impl_type = op_dist_impl.type
-            op_dist_attr.impl_idx = op_dist_impl.idx
-    else:
-        for tensor_node in op_node.outputs:
-            if tensor_node.var() is not None:
-                if tensor_node.var().type() == core.VarDesc.VarType.READER:
-                    continue
-                tensor_desc = tensor_node.var()
-                if op_dist_attr.is_annotated_output_dims_mapping(
-                        tensor_desc.name()):
-                    continue
-                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
-                    tensor_node)
-                tensor_dims_mapping = tensor_dist_attr.dims_mapping
-                op_dims_mapping = op_dist_attr.get_output_dims_mapping(
-                    tensor_desc.name())
-                compatible_dims_mapping = compute_compatible_dims_mapping(
-                    [op_dims_mapping, tensor_dims_mapping])
-                if (compatible_dims_mapping is not None) and \
-                    (compatible_dims_mapping != op_dims_mapping):
-                    op_dist_attr.set_output_dims_mapping(
-                        tensor_desc.name(), compatible_dims_mapping)
-                    changed = True
-        # Find the most compatible implemenetations from the distributed operator
-        op_dist_impl = find_best_compatible_distributed_operator_impl(
-            dist_op, fwd=False)
-        assert op_dist_impl is not None, "Cannot find the dist op implementation."
-        dim_changed = op_dist_impl.update_dims_mapping(dist_op)
-        if dim_changed:
-            changed = True
-        if op_dist_impl.is_auto_compatible(dist_op):
-            if op_dist_impl.type == "elementwise":
-                op_dist_attr.impl_type = "default"
+    def _update_process_mesh(self):
+        def _find_nearset_node(nodes, idx):
+            for node in reversed(nodes[:idx]):
+                node_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                    node)
+                if node_dist_attr.process_mesh is not None:
+                    return node
+
+        total_reach_fix_point = False
+        while not total_reach_fix_point:
+            total_changed = False
+            for is_fwd in [True, False]:
+                all_nodes = self._dist_context.serial_ordered_nodes \
+                    if is_fwd else reversed(self._dist_context.serial_ordered_nodes)
+                reach_fix_point = False
+                while not reach_fix_point:
+                    changed = False
+                    for idx, node in enumerate(all_nodes):
+                        nearest_node = _find_nearset_node(
+                            self._dist_context.serial_ordered_nodes, idx)
+                        if nearest_node is None:
+                            continue
+                        nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph(
+                            nearest_node)
+                        nearest_process_mesh = nearest_node_dis_attr.process_mesh
+                        cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph(
+                            node)
+                        cur_process_mesh = cur_node_dist_attr.process_mesh
+                        compatible_process_mesh = compute_compatible_process_mesh(
+                            [cur_process_mesh, nearest_process_mesh])
+                        if compatible_process_mesh is not None \
+                            and cur_process_mesh != compatible_process_mesh:
+                            cur_node_dist_attr.process_mesh = compatible_process_mesh
+                            changed = True
+                    if changed:
+                        reach_fix_point = False
+                        total_changed = True
+                    else:
+                        reach_fix_point = True
+            if total_changed:
+                total_reach_fix_point = False
             else:
-                op_dist_attr.impl_type = op_dist_impl.type
-            op_dist_attr.impl_idx = op_dist_impl.idx
-    return changed
-
-
-def complete_annotation(program, dist_context=None):
-    """ Complete annotation for the partial annotated program.
-
-    Arguments:
-        program: partial annotated program.
-        dist_context: the distributed context is used to store distributed attributes for program.
-            If not provided, the default one will be used.
-    Returns:
-        program: completed annotated program.
-    """
-
-    # Use the default distribted context for completeion if there is no one
-    if dist_context is None:
-        dist_context = get_default_distributed_context()
-        dist_context.serial_program = program
-    else:
-        dist_context.serial_program = program
-
-    # print_program_with_dist_attr(program, dist_context)
-
-    # Initialize distributed attributes for all var and op node in program
-    dist_context.init_dist_attr_for_program()
-
-    # Initialize distributed attributes for all var and op node in graph
-    dist_context.init_dist_attr_for_graph()
-
-    # Complete process mesh for each node
-    all_nodes = list(dist_context.serial_graph.all_nodes())
+                total_reach_fix_point = True
 
-    def sort_key_fun(node):
-        first = -1
-        if node.is_op():
-            first = 0
-        else:
-            first = 1
-        second = -1
-        if node.is_op() and node.op() is not None:
-            second = node.op().id()
-        if node.is_var() and node.var() is not None:
-            second = node.var().id()
-        return (first, second)
-
-    all_nodes.sort(key=sort_key_fun)
-
-    reach_fix_point = False
-    while not reach_fix_point:
-        total_changed = False
-        reach_fwd_fix_point = False
-        reach_bwd_fix_point = False
-        while not reach_fwd_fix_point:
+    def _update_dims_mapping(self):
+        # Complete dims_mapping for each node
+        reach_fix_point = False
+        while not reach_fix_point:
             changed = False
-            for node in all_nodes:
-                if node.is_var() and node.var() is not None:
-                    tensor_changed = update_tensor_node_process_mesh(
-                        dist_context, node, fwd=True)
-                    if tensor_changed:
-                        changed = True
-                if node.is_op() and node.op() is not None:
-                    op_changed = update_op_node_process_mesh(
-                        dist_context, node, fwd=True)
-                    if op_changed:
-                        changed = True
+            for is_fwd in [True, False]:
+                all_nodes = self._dist_context.serial_ordered_nodes \
+                    if is_fwd else reversed(self._dist_context.serial_ordered_nodes)
+                for node in all_nodes:
+                    if node.is_var() and node.var() is not None:
+                        tensor_changed = self._update_tensor_node_dims_mapping(
+                            node, fwd=is_fwd)
+                        if tensor_changed:
+                            changed = True
+                    if node.is_op() and node.op() is not None:
+                        op_changed = self._update_op_node_dims_mapping(
+                            node, fwd=is_fwd)
+                        if op_changed:
+                            changed = True
             if changed:
-                reach_fwd_fix_point = False
-                total_changed = True
+                reach_fix_point = False
             else:
-                reach_fwd_fix_point = True
-        while not reach_bwd_fix_point:
-            changed = False
-            for node in all_nodes:
-                if node.is_var() and node.var() is not None:
-                    tensor_changed = update_tensor_node_process_mesh(
-                        dist_context, node, fwd=False)
-                    if tensor_changed:
-                        changed = True
-                if node.is_op() and node.op() is not None:
-                    op_changed = update_op_node_process_mesh(
-                        dist_context, node, fwd=False)
-                    if op_changed:
-                        changed = True
-            if changed:
-                reach_bwd_fix_point = False
-                total_changed = True
-            else:
-                reach_bwd_fix_point = True
-        if total_changed:
-            reach_fix_point = False
-        else:
-            reach_fix_point = True
-            # Validation the completion of process meshes and should be moved to a proper location
-            is_wrong = False
-            for node in all_nodes:
-                if node.is_var() and node.var() is not None:
-                    tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
-                        node)
-                    if tensor_dist_attr.process_mesh is None:
-                        msg_str = ""
-                        for op_node in node.inputs:
-                            if op_node.op() is not None:
-                                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
-                                    op_node)
-                                msg_str += "{} [{}], ".format(
-                                    op_node.op().type(),
-                                    op_dist_attr.process_mesh)
-                            else:
-                                msg_str += "{} [{}], ".format(op_node.name(),
-                                                              None)
-                        for op_node in node.outputs:
-                            if op_node.op() is not None:
-                                op_dist_attr = dist_context.get_op_dist_attr_for_graph(
-                                    op_node)
-                                msg_str += "{} [{}], ".format(
-                                    op_node.op().type(),
-                                    op_dist_attr.process_mesh)
-                            else:
-                                msg_str += "{} [{}], ".format(op_node.name(),
-                                                              None)
-                        msg_str = "Cannot decide ProcessMesh of {} among {}. Please use shard_tensor api explicitly to annotate it".format(
-                            node.var().name(), msg_str[:-2])
-                        is_wrong = True
-                        print(msg_str)
-                if node.is_op() and node.op() is not None:
-                    op_dist_attr = dist_context.get_op_dist_attr_for_graph(node)
-                    if op_dist_attr.process_mesh is None:
-                        msg_str = ""
-                        for tensor_node in node.inputs:
-                            if tensor_node.var() is not None:
-                                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
-                                    tensor_node)
-                                msg_str += "{} [{}], ".format(
-                                    tensor_node.var().name(),
-                                    tensor_dist_attr.process_mesh)
-                            else:
-                                msg_str += "{} [{}], ".format(
-                                    tensor_node.name(), None)
-                        for tensor_node in node.outputs:
-                            if tensor_node.var() is not None:
-                                tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(
-                                    tensor_node)
-                                msg_str += "{} [{}], ".format(
-                                    tensor_node.var().name(),
-                                    tensor_dist_attr.process_mesh)
-                            else:
-                                msg_str += "{} [{}], ".format(
-                                    tensor_node.name(), None)
-                        msg_str = "Cannot decide ProcessMesh of {} among {}. Please use shard_op api explicitly to annotate it".format(
-                            node.op().type(), msg_str[:-2])
-                        is_wrong = True
-                        print(msg_str)
-                if node.is_op() and node.op() is None:
-                    print("op op is None", node.name())
-            if is_wrong:
-                assert False, "Cannot complete process_meshes of the program."
-
-    # Complete dims_mapping for each node
-    reach_fix_point = False
-    while not reach_fix_point:
-        changed = False
-        for node in all_nodes:
-            if node.is_var() and node.var() is not None:
-                tensor_changed = update_tensor_node_dims_mapping(
-                    dist_context, node, fwd=True)
-                if tensor_changed:
-                    changed = True
-            if node.is_op() and node.op() is not None:
-                op_changed = update_op_node_dims_mapping(
-                    dist_context, node, fwd=True)
-                if op_changed:
-                    changed = True
-        for node in reversed(all_nodes):
-            if node.is_var() and node.var() is not None:
-                tensor_changed = update_tensor_node_dims_mapping(
-                    dist_context, node, fwd=False)
-                if tensor_changed:
-                    changed = True
-            if node.is_op() and node.op() is not None:
-                op_changed = update_op_node_dims_mapping(
-                    dist_context, node, fwd=False)
-                if op_changed:
-                    changed = True
-        if changed:
-            reach_fix_point = False
-        else:
-            reach_fix_point = True
-
-    # Copy the corresponding distributed attribute from graph to program
-    dist_context.copy_dist_attr_from_graph_to_program()
-    dist_context.clear_dist_info_for_graph()
-
-    # Do the validation check and amend some completion
-    dist_context.amend_dist_attr_for_program()
-
-    # print_program_with_dist_attr(program, dist_context)
-    dist_context.validate_dist_attr_for_program()
+                reach_fix_point = True
+
+    def complete_forward_annotation(self, serial_main_program):
+        """ Complete annotation for the partial annotated serial_main_program.
+
+        Arguments:
+            serial_main_program: partial annotated serial_main_program.
+
+        Returns:
+            serial_main_program: completed annotated serial_main_program.
+        """
+
+        # Attach the serial main program to the distributed context
+        self._dist_context.serial_program = serial_main_program
+
+        # Initialize distributed attributes for all var and op node in serial_main_program
+        self._dist_context.init_dist_attr_for_program()
+
+        # Initialize distributed attributes for all var and op node in graph
+        self._dist_context.init_dist_attr_for_graph()
+
+        self._update_process_mesh()
+
+        # Complete dims_mapping for each node
+        self._update_dims_mapping()
+
+        # Copy the corresponding distributed attribute from graph to serial_main_program
+        self._dist_context.copy_dist_attr_from_graph_to_program()
+        self._dist_context.clear_dist_info_for_graph()
+
+        # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
+        # Do the validation check and amend some completion
+        self._dist_context.amend_dist_attr_for_program()
+
+        # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context)
+        self._dist_context.validate_dist_attr_for_program()
+
+        return serial_main_program
+
+    def complete_backward_annotation(self, serial_main_program):
+        """Complete the annotation of vars and ops in the backward phase for parallel program."""
+
+        def _is_grad_var_name(name):
+            if "@GRAD" in name:
+                return True
+            return False
+
+        def _get_forward_varname_from_grad_varname(grad_var_name):
+            assert _is_grad_var_name(
+                grad_var_name), "[{}] is not a grad varnme.".format(
+                    grad_var_name)
+            return grad_var_name[:grad_var_name.find("@GRAD")]
+
+        def _get_op_by_id(ops, id):
+            for op in ops:
+                if op.desc.id() == id:
+                    return op
+            return None
+
+        first_backward_op_idx = -1
+        for idx, op in enumerate(serial_main_program.global_block().ops):
+            if int(op.attr('op_role')) == int(
+                    int(core.op_proto_and_checker_maker.OpRole.Backward) | int(
+                        core.op_proto_and_checker_maker.OpRole.Loss)):
+                assert op.type == "fill_constant"
+                first_backward_op_idx = idx
+                break
+
+        assert first_backward_op_idx >= 0, "No backward procedure found in this program."
+
+        ops = list(serial_main_program.global_block().ops)
+        vars = serial_main_program.global_block().vars
+        dist_op_context = self._dist_context.dist_op_context
+
+        for idx in range(first_backward_op_idx, len(ops)):
+
+            # complete the initial grad loss op
+            if idx == first_backward_op_idx:
+                assert ops[idx].type == "fill_constant"
+                assert len(
+                    ops[idx].input_arg_names
+                ) == 0, "first backward op should has only ONE output, but got [{}]".format(
+                    len(ops[idx].input_arg_names))
+                assert len(
+                    ops[idx].output_arg_names
+                ) == 1, "first backward op should has only ONE output, but got [{}]".format(
+                    len(ops[idx].output_arg_names))
+
+                grad_var = vars[ops[idx].output_arg_names[0]]
+                forward_var_name = _get_forward_varname_from_grad_varname(
+                    grad_var.name)
+                forward_var = vars[forward_var_name]
+
+                # TODO complete other attributes for grad var
+                tensor_dist_attr = TensorDistributedAttribute()
+                process_mesh = self._dist_context.get_tensor_dist_attr_for_program(
+                    forward_var).process_mesh
+                dims_mapping = self._dist_context.get_tensor_dist_attr_for_program(
+                    forward_var).dims_mapping
+                tensor_dist_attr.dims_mapping = dims_mapping
+                tensor_dist_attr.process_mesh = process_mesh
+                self._dist_context.set_tensor_dist_attr_for_program(
+                    grad_var, tensor_dist_attr)
 
-    return program
-
-
-def complete_backward_annotation(auto_parallel_main_prog, dist_context=None):
-    """Complete the annotation of vars and ops in the backward phase for parallel program."""
-
-    def _is_grad_var_name(name):
-        if "@GRAD" in name:
-            return True
-        return False
-
-    def _get_forward_varname_from_grad_varname(grad_var_name):
-        assert _is_grad_var_name(
-            grad_var_name), "[{}] is not a grad varnme.".format(grad_var_name)
-        return grad_var_name[:grad_var_name.find("@GRAD")]
-
-    def _get_op_by_id(ops, id):
-        for op in ops:
-            if op.desc.id() == id:
-                return op
-        return None
+                op_dist_attr = OperatorDistributedAttribute()
+                op_dist_attr.process_mesh = process_mesh
+                op_dist_attr.set_output_dims_mapping(grad_var.name,
+                                                     dims_mapping)
+                self._dist_context.set_op_dist_attr_for_program(ops[idx],
+                                                                op_dist_attr)
+                continue
 
-    if dist_context is None:
-        dist_context = get_default_distributed_context()
-
-    first_backward_op_idx = -1
-    for idx, op in enumerate(auto_parallel_main_prog.global_block().ops):
-        if int(op.attr('op_role')) == int(
-                int(core.op_proto_and_checker_maker.OpRole.Backward) | int(
-                    core.op_proto_and_checker_maker.OpRole.Loss)):
-            assert op.type == "fill_constant"
-            first_backward_op_idx = idx
-            break
-
-    assert first_backward_op_idx >= 0, "No backward procedure found in this program."
-
-    ops = list(auto_parallel_main_prog.global_block().ops)
-    vars = auto_parallel_main_prog.global_block().vars
-    dist_op_context = dist_context.dist_op_context
-
-    for idx in range(first_backward_op_idx, len(ops)):
-
-        # complete the initial grad loss op
-        if idx == first_backward_op_idx:
-            assert ops[idx].type == "fill_constant"
-            assert len(
-                ops[idx].input_arg_names
-            ) == 0, "first backward op should has only ONE output, but got [{}]".format(
-                len(ops[idx].input_arg_names))
-            assert len(
-                ops[idx].output_arg_names
-            ) == 1, "first backward op should has only ONE output, but got [{}]".format(
-                len(ops[idx].output_arg_names))
-
-            grad_var = vars[ops[idx].output_arg_names[0]]
-            forward_var_name = _get_forward_varname_from_grad_varname(
-                grad_var.name)
-            forward_var = vars[forward_var_name]
-
-            # TODO complete other attribte for grad var
-            tensor_dist_attr = TensorDistributedAttribute()
-            process_mesh = dist_context.get_tensor_dist_attr_for_program(
-                forward_var).process_mesh
-            dims_mapping = dist_context.get_tensor_dist_attr_for_program(
-                forward_var).dims_mapping
-            tensor_dist_attr.dims_mapping = dims_mapping
-            tensor_dist_attr.process_mesh = process_mesh
-            dist_context.set_tensor_dist_attr_for_program(grad_var,
-                                                          tensor_dist_attr)
-
-            op_dist_attr = OperatorDistributedAttribute()
-            op_dist_attr.process_mesh = process_mesh
-            op_dist_attr.set_output_dims_mapping(grad_var.name, dims_mapping)
-            dist_context.set_op_dist_attr_for_program(ops[idx], op_dist_attr)
-            continue
-
-        # complete the annotation of grad op (xxx_grad op or sum op)
-        # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id
-        grad_op = ops[idx]
-        if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id:
-            # TODO support the case where one forward op corresponding to multiple xxx_grad op
-            forward_op = _get_op_by_id(
-                ops[:first_backward_op_idx],
-                dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()])
-            assert forward_op is not None
-
-            # op dist attr
-            forward_op_dist_attr = dist_context.get_op_dist_attr_for_program(
-                forward_op)
-            forward_op_process_mesh = forward_op_dist_attr.process_mesh
-            grad_op_dist_attr = OperatorDistributedAttribute()
-            grad_op_dist_attr.process_mesh = forward_op_process_mesh
-
-            # var 
-            for input_name in grad_op.input_arg_names:
-                input_var = vars[input_name]
-                ref_dims_mapping = None
-                if "@GRAD" in input_name:
-                    forward_name = _get_forward_varname_from_grad_varname(
-                        input_name)
-                    ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping(
-                        forward_name)
-                else:
-                    if forward_op_dist_attr.get_input_dims_mapping(input_name):
-                        ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping(
+            # complete the annotation of grad op (xxx_grad op or sum op)
+            # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id
+            grad_op = ops[idx]
+            if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id:
+                # TODO support the case where one forward op corresponds to multiple xxx_grad ops
+                forward_op = _get_op_by_id(
+                    ops[:first_backward_op_idx],
+                    dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()])
+                assert forward_op is not None
+
+                # op dist attr
+                forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program(
+                    forward_op)
+                forward_op_process_mesh = forward_op_dist_attr.process_mesh
+                grad_op_dist_attr = OperatorDistributedAttribute()
+                grad_op_dist_attr.process_mesh = forward_op_process_mesh
+
+                # var
+                for input_name in grad_op.input_arg_names:
+                    input_var = vars[input_name]
+                    ref_dims_mapping = None
+                    if "@GRAD" in input_name:
+                        forward_name = _get_forward_varname_from_grad_varname(
                             input_name)
-                    else:
                         ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping(
-                            input_name)
-
-                assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format(
-                    input_var.name)
-                grad_op_dist_attr.set_input_dims_mapping(input_name,
-                                                         ref_dims_mapping)
-
-            for output_name in grad_op.desc.output_names():
-                assert len(grad_op.desc.output(output_name)) in [0, 1]
-                if _is_grad_var_name(output_name):
-                    input_name = _get_forward_varname_from_grad_varname(
-                        output_name)
-                else:
-                    assert grad_op.type in [
-                        "cast", "c_identity", "c_allreduce_sum"
-                    ]
-                    input_name = "X"
-                assert input_name in forward_op.desc.input_names(
-                ), "var [{}] in op [{}]'s output but could not find [{}] in its forward op".format(
-                    output_name, grad_op.type, input_name)
-                if len(grad_op.desc.output(output_name)) == 1:
-                    # tensor dist attr
-                    output_var = vars[grad_op.desc.output(output_name)[0]]
-                    forward_name = _get_forward_varname_from_grad_varname(
-                        output_var.name)
-                    ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping(
-                        forward_name)
-
-                    output_var_dist_attr = TensorDistributedAttribute()
-                    output_var_dist_attr.dims_mapping = ref_dims_mapping
-                    output_var_dist_attr.process_mesh = forward_op_process_mesh
-                    dist_context.set_tensor_dist_attr_for_program(
-                        output_var, output_var_dist_attr)
-
-                    grad_op_dist_attr.set_output_dims_mapping(output_var.name,
-                                                              ref_dims_mapping)
-
-            dist_context.set_op_dist_attr_for_program(grad_op,
-                                                      grad_op_dist_attr)
-
-        # only sum op for merge mutiple version grad has no a corresponding mapping in grad_op_id_to_op_id
-        else:
-            assert grad_op.type == "sum", "got unexpect op [{}]".format(
-                str(grad_op.type))
-            assert all(map(_is_grad_var_name, grad_op.input_arg_names))
-            assert len(grad_op.output_arg_names) == 1
-
-            ref_forward_var_name = _get_forward_varname_from_grad_varname(
-                grad_op.output_arg_names[0])
-            forward_var = vars[ref_forward_var_name]
-            ref_forward_var_dims_mapping = dist_context.get_tensor_dist_attr_for_program(
-                forward_var).dims_mapping
-            ref_forward_var_process_mesh = dist_context.get_tensor_dist_attr_for_program(
-                forward_var).process_mesh
-
-            # output
-            tensor_dist_attr = TensorDistributedAttribute()
-            tensor_dist_attr.dims_mapping = ref_forward_var_dims_mapping
-            tensor_dist_attr.process_mesh = ref_forward_var_process_mesh
-            dist_context.set_tensor_dist_attr_for_program(
-                vars[grad_op.output_arg_names[0]], tensor_dist_attr)
-
-            # op
-            grad_op_dist_attr = OperatorDistributedAttribute()
-            grad_op_dist_attr.process_mesh = ref_forward_var_process_mesh
-            for var_name in grad_op.input_arg_names:
-                assert _get_forward_varname_from_grad_varname(
-                    var_name) == ref_forward_var_name
-                grad_op_dist_attr.set_input_dims_mapping(
-                    var_name, ref_forward_var_dims_mapping)
-
-            grad_op_dist_attr.set_output_dims_mapping(
-                grad_op.output_arg_names[0], ref_forward_var_dims_mapping)
-            dist_context.set_op_dist_attr_for_program(grad_op,
-                                                      grad_op_dist_attr)
-
-
-def complete_update_annotation(auto_parallel_main_prog, dist_context):
-    """Complete the annotation of vars and ops in the update phase for parallel program."""
-
-    if dist_context is None:
-        dist_context = get_default_distributed_context()
-
-    ops = list(auto_parallel_main_prog.global_block().ops)
-    vars = auto_parallel_main_prog.global_block().vars
-    learning_rate_completed = False
-
-    for idx in range(len(ops)):
-
-        # complete the annotation of the optimizer op.
-        # TODO to add attribute for moment var
-        op = ops[idx]
-        if int(op.attr('op_role')) == int(OpRole.Optimize):
-            if op.type == "clip_by_norm":
-
-                param_grad = vars[op.input("X")[0]]
-                param_grad_dist_attr = dist_context.get_tensor_dist_attr_for_program(
-                    param_grad)
-                assert param_grad_dist_attr is not None
-                ref_process_mesh = param_grad_dist_attr.process_mesh
-                ref_dims_mapping = param_grad_dist_attr.dims_mapping
-
-                out = vars[op.output("Out")[0]]
-                out_dist_attr = TensorDistributedAttribute()
-                out_dist_attr.process_mesh = ref_process_mesh
-                out_dist_attr.dims_mapping = ref_dims_mapping
-                dist_context.set_tensor_dist_attr_for_program(out,
-                                                              out_dist_attr)
+                            forward_name)
+                    else:
+                        if forward_op_dist_attr.get_input_dims_mapping(
+                                input_name):
+                            ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping(
+                                input_name)
+                        else:
+                            ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping(
+                                input_name)
+
+                    assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format(
+                        input_var.name)
+                    grad_op_dist_attr.set_input_dims_mapping(input_name,
+                                                             ref_dims_mapping)
 
-                op_dist_attr = OperatorDistributedAttribute()
-                op_dist_attr.process_mesh = ref_process_mesh
-                op_dist_attr.set_input_dist_attr(param_grad.name,
-                                                 param_grad_dist_attr)
-                op_dist_attr.set_output_dist_attr(out.name, out_dist_attr)
-                dist_context.set_op_dist_attr_for_program(op, op_dist_attr)
-
-            if "Grad" in op.input_names and "Param" in ops[idx].input_names:
-                assert len(op.input(
-                    "Param")) == 1, "Only support one-to-one now."
-                assert len(op.input(
-                    "Grad")) == 1, "Only support one-to-one now."
-                param = vars[op.input("Param")[0]]
-                grad_var = vars[op.input("Grad")[0]]
-
-                param_dist_attr = dist_context.get_tensor_dist_attr_for_program(
-                    param)
-                assert param_dist_attr is not None
-                ref_process_mesh = dist_context.get_tensor_dist_attr_for_program(
-                    param).process_mesh
-                assert ref_process_mesh is not None
-                ref_dims_mapping = dist_context.get_tensor_dist_attr_for_program(
-                    param).dims_mapping
-                assert ref_dims_mapping is not None
-                op_dist_attr = OperatorDistributedAttribute()
-                op_dist_attr.process_mesh = ref_process_mesh
-                op_dist_attr.set_input_dims_mapping(grad_var.name,
-                                                    ref_dims_mapping)
-                op_dist_attr.set_input_dims_mapping(param.name,
-                                                    ref_dims_mapping)
-                op_dist_attr.set_output_dims_mapping(param.name,
-                                                     ref_dims_mapping)
-                learning_var = vars[op.input("LearningRate")[0]]
-                op_dist_attr.set_input_dims_mapping(learning_var.name, [-1])
-                op_dist_attr.set_output_dims_mapping(learning_var.name, [-1])
-
-                if not learning_rate_completed:
-                    learning_rate_completed = True
-                    var_dist_attr = TensorDistributedAttribute()
-                    var_dist_attr.process_mesh = ref_process_mesh
-                    var_dist_attr.dims_mapping = [-1]
-                    dist_context.set_tensor_dist_attr_for_program(learning_var,
-                                                                  var_dist_attr)
-
-                for input_name in op.desc.input_names():
-
-                    if input_name in [
-                            'Param', 'Grad', 'LearningRate', "SkipUpdate",
-                            "Beta1Tensor", "Beta2Tensor", "EpsilonTensor",
-                            "MasterParam"
-                    ]:
-                        continue
+                for output_name in grad_op.desc.output_names():
+                    assert len(grad_op.desc.output(output_name)) in [0, 1]
+                    if _is_grad_var_name(output_name):
+                        input_name = _get_forward_varname_from_grad_varname(
+                            output_name)
+                    else:
+                        assert grad_op.type in [
+                            "cast", "c_identity", "c_allreduce_sum"
+                        ]
+                        input_name = "X"
+                    assert input_name in forward_op.desc.input_names(
+                    ), "var [{}] in op [{}]'s output but could not find [{}] in its forward op".format(
+                        output_name, grad_op.type, input_name)
+                    if len(grad_op.desc.output(output_name)) == 1:
+                        # tensor dist attr
+                        output_var = vars[grad_op.desc.output(output_name)[0]]
+                        forward_name = _get_forward_varname_from_grad_varname(
+                            output_var.name)
+                        ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping(
+                            forward_name)
 
-                    assert len(op.desc.input(input_name)) == 1
-                    input_var = vars[op.desc.input(input_name)[0]]
-                    input_var_attr = TensorDistributedAttribute()
+                        output_var_dist_attr = TensorDistributedAttribute()
+                        output_var_dist_attr.dims_mapping = ref_dims_mapping
+                        output_var_dist_attr.process_mesh = forward_op_process_mesh
+                        self._dist_context.set_tensor_dist_attr_for_program(
+                            output_var, output_var_dist_attr)
 
-                    if "Beta1Pow" in input_name or "Beta2Pow" in input_name:
-                        input_var_attr.dims_mapping = [-1]
-                        op_dist_attr.set_input_dims_mapping(input_var.name,
-                                                            [-1])
-                        op_dist_attr.set_output_dims_mapping(input_var.name,
-                                                             [-1])
-                    else:
-                        assert "Moment" in input_name
-                        input_var_attr.dims_mapping = ref_dims_mapping
-                        op_dist_attr.set_input_dims_mapping(input_var.name,
-                                                            ref_dims_mapping)
-                        op_dist_attr.set_output_dims_mapping(input_var.name,
-                                                             ref_dims_mapping)
+                        grad_op_dist_attr.set_output_dims_mapping(
+                            output_var.name, ref_dims_mapping)
 
-                    input_var_attr.process_mesh = ref_process_mesh
-                    dist_context.set_tensor_dist_attr_for_program(
-                        input_var, input_var_attr)
+                self._dist_context.set_op_dist_attr_for_program(
+                    grad_op, grad_op_dist_attr)
 
-                dist_context.set_op_dist_attr_for_program(op, op_dist_attr)
-                continue
+            # only the sum op that merges multiple versions of a grad has no corresponding mapping in grad_op_id_to_op_id
+            else:
+                assert grad_op.type == "sum", "got unexpect op [{}]".format(
+                    str(grad_op.type))
+                assert all(map(_is_grad_var_name, grad_op.input_arg_names))
+                assert len(grad_op.output_arg_names) == 1
+
+                ref_forward_var_name = _get_forward_varname_from_grad_varname(
+                    grad_op.output_arg_names[0])
+                forward_var = vars[ref_forward_var_name]
+                ref_forward_var_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program(
+                    forward_var).dims_mapping
+                ref_forward_var_process_mesh = self._dist_context.get_tensor_dist_attr_for_program(
+                    forward_var).process_mesh
+
+                # output
+                tensor_dist_attr = TensorDistributedAttribute()
+                tensor_dist_attr.dims_mapping = ref_forward_var_dims_mapping
+                tensor_dist_attr.process_mesh = ref_forward_var_process_mesh
+                self._dist_context.set_tensor_dist_attr_for_program(
+                    vars[grad_op.output_arg_names[0]], tensor_dist_attr)
+
+                # op
+                grad_op_dist_attr = OperatorDistributedAttribute()
+                grad_op_dist_attr.process_mesh = ref_forward_var_process_mesh
+                for var_name in grad_op.input_arg_names:
+                    assert _get_forward_varname_from_grad_varname(
+                        var_name) == ref_forward_var_name
+                    grad_op_dist_attr.set_input_dims_mapping(
+                        var_name, ref_forward_var_dims_mapping)
+
+                grad_op_dist_attr.set_output_dims_mapping(
+                    grad_op.output_arg_names[0], ref_forward_var_dims_mapping)
+                self._dist_context.set_op_dist_attr_for_program(
+                    grad_op, grad_op_dist_attr)
+
+    def complete_update_annotation(self, serial_main_program):
+        """Complete the annotation of vars and ops in the update phase for parallel program."""
+        ops = list(serial_main_program.global_block().ops)
+        vars = serial_main_program.global_block().vars
+        learning_rate_completed = False
+
+        for idx in range(len(ops)):
+
+            # complete the annotation of the optimizer op.
+            # TODO to add attribute for moment var
+            op = ops[idx]
+            if int(op.attr('op_role')) == int(OpRole.Optimize):
+
+                if "Grad" in op.input_names and "Param" in ops[idx].input_names:
+                    assert len(op.input(
+                        "Param")) == 1, "Only support one-to-one now."
+                    assert len(op.input(
+                        "Grad")) == 1, "Only support one-to-one now."
+                    param = vars[op.input("Param")[0]]
+                    grad_var = vars[op.input("Grad")[0]]
+
+                    param_dist_attr = self._dist_context.get_tensor_dist_attr_for_program(
+                        param)
+                    assert param_dist_attr is not None
+                    ref_process_mesh = self._dist_context.get_tensor_dist_attr_for_program(
+                        param).process_mesh
+                    assert ref_process_mesh is not None
+                    ref_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program(
+                        param).dims_mapping
+                    assert ref_dims_mapping is not None
+                    op_dist_attr = OperatorDistributedAttribute()
+                    op_dist_attr.process_mesh = ref_process_mesh
+                    op_dist_attr.set_input_dims_mapping(grad_var.name,
+                                                        ref_dims_mapping)
+                    op_dist_attr.set_input_dims_mapping(param.name,
+                                                        ref_dims_mapping)
+                    op_dist_attr.set_output_dims_mapping(param.name,
+                                                         ref_dims_mapping)
+                    learning_var = vars[op.input("LearningRate")[0]]
+                    op_dist_attr.set_input_dims_mapping(learning_var.name, [-1])
+                    op_dist_attr.set_output_dims_mapping(learning_var.name,
+                                                         [-1])
+
+                    if not learning_rate_completed:
+                        learning_rate_completed = True
+                        var_dist_attr = TensorDistributedAttribute()
+                        var_dist_attr.process_mesh = ref_process_mesh
+                        var_dist_attr.dims_mapping = [-1]
+                        self._dist_context.set_tensor_dist_attr_for_program(
+                            learning_var, var_dist_attr)
+
+                    for input_name in op.desc.input_names():
+
+                        if input_name in [
+                                'Param', 'Grad', 'LearningRate', "SkipUpdate",
+                                "Beta1Tensor", "Beta2Tensor", "EpsilonTensor",
+                                "MasterParam"
+                        ]:
+                            continue
+
+                        assert len(op.desc.input(input_name)) == 1
+                        input_var = vars[op.desc.input(input_name)[0]]
+                        input_var_attr = TensorDistributedAttribute()
+
+                        if "Beta1Pow" in input_name or "Beta2Pow" in input_name:
+                            input_var_attr.dims_mapping = [-1]
+                            op_dist_attr.set_input_dims_mapping(input_var.name,
+                                                                [-1])
+                            op_dist_attr.set_output_dims_mapping(input_var.name,
+                                                                 [-1])
+                        else:
+                            assert "Moment" in input_name
+                            input_var_attr.dims_mapping = ref_dims_mapping
+                            op_dist_attr.set_input_dims_mapping(
+                                input_var.name, ref_dims_mapping)
+                            op_dist_attr.set_output_dims_mapping(
+                                input_var.name, ref_dims_mapping)
+
+                        input_var_attr.process_mesh = ref_process_mesh
+                        self._dist_context.set_tensor_dist_attr_for_program(
+                            input_var, input_var_attr)
+
+                    self._dist_context.set_op_dist_attr_for_program(
+                        op, op_dist_attr)
+                    continue
diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py
index ad3a53ff17d76..e06811df88179 100644
--- a/python/paddle/distributed/auto_parallel/dist_context.py
+++ b/python/paddle/distributed/auto_parallel/dist_context.py
@@ -247,23 +247,23 @@ def get_op_dist_attr_for_graph(self, serial_op_node):
     #     new_dist_op = DistributedOperator(dist_op.serial_op, dist_attr)
     #     self._dist_ops_for_graph[serial_op_node_id] = new_dist_op
 
-    # def get_dist_attr_for_graph(self, serial_node):
-    #     if serial_node.is_var() and serial_node.var() is not None:
-    #         serial_tensor_node_id = serial_node.id()
-    #         dist_tensor = self._dist_tensors_for_graph.get(
-    #             serial_tensor_node_id, None)
-    #         if dist_tensor:
-    #             return dist_tensor.dist_attr
-    #         else:
-    #             return None
-    #     if serial_node.is_op() and serial_node.op() is not None:
-    #         serial_op_node_id = serial_node.id()
-    #         dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
-    #         if dist_op:
-    #             return dist_op.dist_attr
-    #         else:
-    #             return None
-    #     return None
+    def get_dist_attr_for_graph(self, serial_node):
+        if serial_node.is_var() and serial_node.var() is not None:
+            serial_tensor_node_id = serial_node.id()
+            dist_tensor = self._dist_tensors_for_graph.get(
+                serial_tensor_node_id, None)
+            if dist_tensor:
+                return dist_tensor.dist_attr
+            else:
+                return None
+        if serial_node.is_op() and serial_node.op() is not None:
+            serial_op_node_id = serial_node.id()
+            dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None)
+            if dist_op:
+                return dist_op.dist_attr
+            else:
+                return None
+        return None
 
     def init_dist_attr_for_program(self):
         assert self._serial_program, \
diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py
index d6035d02953ac..43f5fa264790f 100644
--- a/python/paddle/distributed/auto_parallel/parallelizer.py
+++ b/python/paddle/distributed/auto_parallel/parallelizer.py
@@ -32,7 +32,7 @@
 from .dist_context import DistributedContext
 from .dist_context import get_default_distributed_context
 from .dist_context import set_default_distributed_context
-from .completion import complete_annotation, complete_backward_annotation, complete_update_annotation
+from .completion import Completer
 from .partitioner import Partitioner
 from .process_group import get_all_process_groups
 from .process_group import get_process_group
@@ -130,8 +130,8 @@ def _generate_backward(self, main_program, startup_program, loss,
                 no_grad_set,
                 callbacks,
                 distop_context=self._dist_context.dist_op_context)
-        complete_backward_annotation(
-            main_program, dist_context=self._dist_context)
+        self._completer = Completer(self._dist_context)
+        self._completer.complete_backward_annotation(main_program)
 
         return params_grads
 
@@ -142,8 +142,8 @@ def _apply_optimize(self, main_program, startup_program, params_grads):
                 params_grads)
 
         # update completion 
-        complete_update_annotation(
-            main_program, dist_context=self._dist_context)
+        self._completer = Completer(self._dist_context)
+        self._completer.complete_update_annotation(main_program)
 
         return optimize_ops
 
@@ -179,8 +179,9 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False):
             # Annotation completion
             self._dist_context = DistributedContext()
             _logger.info("Start annotation dist attr.")
-            completed_main_program = complete_annotation(serial_main_program,
-                                                         self._dist_context)
+            self._completer = Completer(self._dist_context)
+            completed_main_program = self._completer.complete_forward_annotation(
+                serial_main_program)
         else:
             completed_main_program = serial_main_program
             self._dist_context = copy.deepcopy(dist_context)
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
index 05d71aca5db2c..bc4f1671f4e20 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py
@@ -27,6 +27,7 @@
 from paddle.fluid import layers
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
 from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 from paddle.distributed.auto_parallel.utils import append_distributed_attr_suffix
@@ -154,10 +155,9 @@ def test_mlp_dp(self):
         dist_context = DistributedContext()
         train_program, start_program = mlp_pretrain_forward(train_program,
                                                             start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_mlp_mp(self):
@@ -171,10 +171,9 @@ def test_mlp_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = mlp_pretrain_forward(train_program,
                                                             start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_mlp_dp_mp(self):
@@ -189,10 +188,9 @@ def test_mlp_dp_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = mlp_pretrain_forward(train_program,
                                                             start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     # def test_mlp_misc(self):
@@ -212,8 +210,8 @@ def test_mlp_dp_mp(self):
     #     train_program, start_program = mlp_pretrain_forward(train_program,
     #                                                         start_program)
     #     # pdb.set_trace()
-    #     complete_train_program = auto.complete_annotation(train_program,
-    #                                                       dist_context)
+    #     completer = Completer(dist_context)
+    #     complete_train_program = completer.complete_forward_annotation(train_program)
     #     # print_program_with_dist_attr(complete_train_program,
     #     #                                     dist_context)
     #     dist_context.finalize_distributed_attr_for_program(
@@ -423,8 +421,9 @@ def test_attn_dp(self):
         dist_context = DistributedContext()
         train_program, start_program = attn_pretrain_forward(train_program,
                                                              start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         # print_program_with_dist_attr(complete_train_program,
         #                                     dist_context)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
@@ -440,10 +439,9 @@ def test_attn_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = attn_pretrain_forward(train_program,
                                                              start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_attn_dp_mp(self):
@@ -458,10 +456,9 @@ def test_attn_dp_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = attn_pretrain_forward(train_program,
                                                              start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
 
@@ -747,10 +744,9 @@ def test_decoder_dp(self):
         dist_context = DistributedContext()
         train_program, start_program = decoder_pretrain_forward(train_program,
                                                                 start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_decoder_mp(self):
@@ -764,10 +760,9 @@ def test_decoder_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = decoder_pretrain_forward(train_program,
                                                                 start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_decoder_dp_mp(self):
@@ -782,10 +777,9 @@ def test_decoder_dp_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = decoder_pretrain_forward(train_program,
                                                                 start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
index c2c1e63155c3a..1293a9644027d 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py
@@ -31,6 +31,7 @@
 from paddle.distributed.fleet import fleet
 import paddle.static as static
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
 from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
@@ -817,10 +818,9 @@ def test_gpt_dp(self):
         dist_context = DistributedContext()
         train_program, start_program = gpt_pretrain_forward(train_program,
                                                             start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_gpt_mp(self):
@@ -834,10 +834,9 @@ def test_gpt_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = gpt_pretrain_forward(train_program,
                                                             start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
     def test_gpt_dp_mp(self):
@@ -852,10 +851,9 @@ def test_gpt_dp_mp(self):
         dist_context = DistributedContext()
         train_program, start_program = gpt_pretrain_forward(train_program,
                                                             start_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
-        # print_program_with_dist_attr(complete_train_program,
-        #                                     dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         self.assertTrue(dist_context.validate_dist_attr_for_program())
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
index 83254de61298b..fd19a5bd8b866 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py
@@ -23,6 +23,7 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.partitioner import Partitioner
@@ -154,8 +155,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     parallelizer._dist_context = dist_context
 
     # serial forward & backward completion
-    complete_train_program = auto.complete_annotation(train_program,
-                                                      dist_context)
+    completer = Completer(dist_context)
+    complete_train_program = completer.complete_forward_annotation(
+        train_program)
 
     params_grads = parallelizer._generate_backward(
         complete_train_program,
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py
index b21cbb5ae78bc..27de9f325063b 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py
@@ -18,6 +18,7 @@
 import paddle
 from paddle.fluid import core
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
 from paddle.distributed.auto_parallel.partitioner import Partitioner
@@ -42,8 +43,9 @@ def get_dist_prog(train_program,
     parallelizer._dist_context = dist_context
 
     # serial forward & backward completion
-    complete_train_program = auto.complete_annotation(
-        train_program, dist_context
+    completer = Completer(dist_context)
+    complete_train_program = completer.complete_forward_annotation(
+        train_program
     ) if complete_train_program is None else complete_train_program
 
     # parallelizer._apply_serial_forward_pass(complete_train_program,
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py
index 3a28595c833e0..9d4de771076cd 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py
@@ -36,6 +36,7 @@
 from paddle.distributed import fleet
 
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed.auto_parallel.partitioner import Partitioner
@@ -433,6 +434,12 @@ def forward(self, input):
         out = F.gelu(out, approximate=True)
         out = self.linear1(out)
 
+        auto.shard_tensor(
+            out,
+            dist_attr={
+                "process_mesh": _global_process_mesh[1],
+                "dims_mapping": [0, -1]
+            })
         out = self.linear2(out)
         out = F.gelu(out, approximate=True)
         out = self.linear3(out)
@@ -476,8 +483,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     parallelizer._dist_context = dist_context
 
     # auto completion
-    complete_train_program = auto.complete_annotation(train_program,
-                                                      dist_context)
+    completer = Completer(dist_context)
+    complete_train_program = completer.complete_forward_annotation(
+        train_program)
 
     params_grads = parallelizer._generate_backward(
         complete_train_program,
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
index 21cf8a904b690..deff2144411fc 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py
@@ -28,6 +28,7 @@
 from paddle.fluid import layers
 from paddle.nn.layer.transformer import _convert_param_attr_to_list
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
 from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 from paddle.distributed.auto_parallel.utils import append_distributed_attr_suffix
@@ -49,8 +50,9 @@ def get_programs(annotated_func):
     global _global_process_mesh
     dist_context.process_mesh = _global_process_mesh
     train_program, start_program = annotated_func(train_program, start_program)
-    complete_train_program = auto.complete_annotation(train_program,
-                                                      dist_context)
+    completer = Completer(dist_context)
+    complete_train_program = completer.complete_forward_annotation(
+        train_program)
 
     rank_id = 3
     dist_strategy = fleet.DistributedStrategy()
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
index dc2ad1d900f52..01e62d886e2b7 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py
@@ -31,6 +31,7 @@
 from paddle.distributed import fleet
 import paddle.static as static
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program
 from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
@@ -881,8 +882,9 @@ def test_gpt_dp_mp(self):
         dist_context.process_mesh = _global_process_mesh
         train_program, startup_program, loss = gpt_pretrain_forward(
             train_program, startup_program)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
 
         # serial backward pass
         params_grads = parallelizer._generate_backward(
@@ -913,8 +915,9 @@ def test_gpt_dp_mp(self):
                   "w") as fw:
             fw.write(str(auto_parallel_startup_prog))
         # with open("./test_auto_parallel_partitioner_main_completed.txt", "w") as fw:
-        #     from paddle.distributed.auto_parallel.completion import complete_backward_annotation
-        #     complete_backward_annotation(auto_parallel_main_prog)
+        #     from paddle.distributed.auto_parallel.completion import Completer
+        #     completer = Completer(dist_context)
+        #     completer.complete_forward_annotation(auto_parallel_main_prog)
         #     fw.write(str(auto_parallel_main_prog))       
         nrank = 4
         # col parallel
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
index 614b996d26521..b234e25823f4b 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py
@@ -22,6 +22,7 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
@@ -152,8 +153,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     parallelizer._dist_context = dist_context
 
     # serial forward & backward completion
-    complete_train_program = auto.complete_annotation(train_program,
-                                                      dist_context)
+    completer = Completer(dist_context)
+    complete_train_program = completer.complete_forward_annotation(
+        train_program)
 
     params_grads = parallelizer._generate_backward(
         complete_train_program,
@@ -299,7 +301,6 @@ def test_mlp_pp(self):
         for key in list(_g_process_group_map.keys()):
             del _g_process_group_map[key]
         reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context)
-        # print_program_with_dist_attr(dist_main_prog, dist_context)
 
         # check send and recv result
         self.assertTrue(check_send_recv_result(dist_main_prog, rank_id))
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
index cfbb7653fad8e..40847a769033a 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py
@@ -22,6 +22,7 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
@@ -116,8 +117,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     parallelizer._dist_context = dist_context
 
     # serial forward & backward completion
-    complete_train_program = auto.complete_annotation(train_program,
-                                                      dist_context)
+    completer = Completer(dist_context)
+    complete_train_program = completer.complete_forward_annotation(
+        train_program)
 
     params_grads = parallelizer._generate_backward(
         complete_train_program,
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
index 272c1c212f08e..869bcd4c7ab32 100644
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py
@@ -22,6 +22,7 @@
 import paddle.nn.functional as F
 import paddle.utils as utils
 import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.completion import Completer
 from paddle.distributed.auto_parallel.dist_context import DistributedContext
 from paddle.distributed import fleet
 from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer
@@ -132,8 +133,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id):
     parallelizer._dist_context = dist_context
 
     # serial forward & backward completion
-    complete_train_program = auto.complete_annotation(train_program,
-                                                      dist_context)
+    completer = Completer(dist_context)
+    complete_train_program = completer.complete_forward_annotation(
+        train_program)
 
     params_grads = parallelizer._generate_backward(
         complete_train_program,
@@ -263,8 +265,9 @@ def test_allgather(self):
         dist_context = DistributedContext()
         dist_strategy = fleet.DistributedStrategy()
         partitioner = Partitioner(dist_context, rank_id)
-        complete_train_program = auto.complete_annotation(train_program,
-                                                          dist_context)
+        completer = Completer(dist_context)
+        complete_train_program = completer.complete_forward_annotation(
+            train_program)
         partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition(
             complete_train_program, startup_program, [])
         reshard(partitioned_main_prog, partitioned_startup_prog, rank_id,
diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
index ed64fa0630fa1..78ad64b1dd852 100755
--- a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
+++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py
@@ -154,7 +154,7 @@ def test_update(self):
         ops = train_program.global_block().ops
         vars = train_program.global_block().vars
         from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container
-        from paddle.distributed.auto_parallel.completion import is_elementwise_like_op
+        from paddle.distributed.auto_parallel.operators.common import is_elementwise_op
         from paddle.distributed.auto_parallel.dist_op import DistributedOperator
 
         for op in ops:
@@ -163,7 +163,7 @@ def test_update(self):
             if dist_op_impl_container is None:
                 op_dist_attr = dist_context.get_op_dist_attr_for_program(op)
                 dist_op = DistributedOperator(op, op_dist_attr)
-                if is_elementwise_like_op(op.type):
+                if is_elementwise_op(op.type):
                     changed = update_op_dims_mapping_by_elementwise_like_dist_impl(
                         dist_op)
                     self.assertFalse(changed)

From ba51a6c8101714dbd03a60830e79c64cb9af7bef Mon Sep 17 00:00:00 2001
From: Tao Luo <luotao02@baidu.com>
Date: Fri, 21 Jan 2022 12:05:09 +0800
Subject: [PATCH 02/15] fix gcd and lcm data type (#39043)

---
 python/paddle/tensor/math.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index c4a92b1486d58..a476a8ccd120a 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -3559,8 +3559,8 @@ def gcd(x, y, name=None):
         If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
 
     Args:
-        x (Tensor): An N-D Tensor, the data type is int8，int16，int32，int64，uint8. 
-        y (Tensor): An N-D Tensor, the data type is int8，int16，int32，int64，uint8. 
+        x (Tensor): An N-D Tensor, the data type is int32, int64.
+        y (Tensor): An N-D Tensor, the data type is int32, int64.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
@@ -3621,8 +3621,8 @@ def _gcd_body_fn(x, y):
 
         return x
     else:
-        check_variable_and_dtype(x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
-        check_variable_and_dtype(y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd')
+        check_variable_and_dtype(x, 'x', ['int32', 'int64'], 'gcd')
+        check_variable_and_dtype(y, 'y', ['int32', 'int64'], 'gcd')
         out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y])
         return out
 
@@ -3637,8 +3637,8 @@ def lcm(x, y, name=None):
         If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output).
 
     Args:
-        x (Tensor): An N-D Tensor, the data type is int8，int16，int32，int64，uint8. 
-        y (Tensor): An N-D Tensor, the data type is int8，int16，int32，int64，uint8. 
+        x (Tensor): An N-D Tensor, the data type is int32, int64.
+        y (Tensor): An N-D Tensor, the data type is int32, int64.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:

From 4adeff06aebf2d824e361caced9f94506a68533b Mon Sep 17 00:00:00 2001
From: FlyingQianMM <245467267@qq.com>
Date: Fri, 21 Jan 2022 12:51:57 +0800
Subject: [PATCH 03/15] add block and grid loop for index_sample kernel to deal
 with a large-shape tensor (#37816)

* add block and grid loop for index_sample kernel to deal with a large-shape tensor

* fix code format

* limit grid dim
---
 paddle/fluid/operators/index_sample_op.cu | 63 +++++++++++++++--------
 1 file changed, 42 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu
index 4260d0516e3cc..45f63c2b2fbd8 100644
--- a/paddle/fluid/operators/index_sample_op.cu
+++ b/paddle/fluid/operators/index_sample_op.cu
@@ -18,9 +18,22 @@
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 
+#define PREDEFINED_BLOCK_SIZE_X 512
+#define PREDEFINED_BLOCK_SIZE 1024
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
 namespace paddle {
 namespace operators {
 
+namespace {
+void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) {
+  dim3 max_grid_dim = ctx.template device_context<platform::CUDADeviceContext>()
+                          .GetCUDAMaxGridDimSize();
+  grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x;
+  grid_dim->y = grid_dim->y < max_grid_dim.y ? grid_dim->y : max_grid_dim.y;
+}
+}  // namespace
+
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 
@@ -28,14 +41,15 @@ template <typename T, typename IndexT = int>
 __global__ void IndexSampleForward(const IndexT* index, const T* in_data,
                                    T* out_data, size_t index_length,
                                    size_t input_length, size_t batch_size) {
-  int index_i = blockDim.x * blockIdx.x + threadIdx.x;
-  int index_j = blockDim.y * blockIdx.y + threadIdx.y;
-  int index_idx = index_j * index_length + index_i;
-  int in_idx = index_j * input_length + index_i;
-
-  if (index_i < index_length & index_j < batch_size) {
-    IndexT sample_idx = index[index_idx];
-    out_data[index_idx] = in_data[in_idx - index_i + sample_idx];
+  const unsigned int start_i = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
+  for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
+    unsigned int index_i = start_i;  // restart the column sweep on every row
+    for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
+      size_t index_idx = index_j * index_length + index_i;  // 64-bit offset
+      IndexT sample_idx = index[index_idx];
+      out_data[index_idx] = in_data[index_j * input_length + sample_idx];
+    }
   }
 }
 
@@ -44,18 +58,20 @@ __global__ void IndexSampleGrad(const IndexT* index, T* in_grad,
                                 const T* out_grad, size_t index_length,
                                 size_t input_length, size_t batch_size,
                                 bool same_data_in_row = true) {
-  int index_i = blockDim.x * blockIdx.x + threadIdx.x;
-  int index_j = blockDim.y * blockIdx.y + threadIdx.y;
-  int index_idx = index_j * index_length + index_i;
-  int in_idx = index_j * input_length + index_i;
-
-  if (index_i < index_length & index_j < batch_size) {
-    IndexT sample_idx = index[index_idx];
-    if (same_data_in_row) {
-      platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]),
-                              out_grad[sample_idx]);
-    } else {
-      in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
+  const unsigned int start_i = blockDim.x * blockIdx.x + threadIdx.x;
+  unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y;
+  for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) {
+    unsigned int index_i = start_i;  // restart the column sweep on every row
+    for (; index_i < index_length; index_i += blockDim.x * gridDim.x) {
+      size_t index_idx = index_j * index_length + index_i;  // 64-bit offset
+      size_t in_idx = index_j * input_length + index_i;
+      IndexT sample_idx = index[index_idx];
+      if (same_data_in_row) {
+        platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]),
+                                out_grad[index_idx]);
+      } else {
+        in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx];
+      }
     }
   }
 }
@@ -93,12 +109,14 @@ class IndexSampleKernel<platform::CUDADeviceContext, T>
     size_t index_length = index_dim[1];
 
     auto block_width = platform::RoundToPowerOfTwo(index_length);
+    block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
     int block_height =
         platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
-
+    block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
     dim3 block_dim(block_width, block_height);
     dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
                   (batch_size + block_dim.y - 1) / block_dim.y);
+    LimitGridDim(ctx, &grid_dim);
 
     if (index_type == framework::proto::VarType::INT64) {
       const int64_t* index_data = index->data<int64_t>();
@@ -150,11 +168,14 @@ class IndexSampleGradKernel<platform::CUDADeviceContext, T>
     bool same_data_in_index_row = index_length == 1 ? false : true;
 
     auto block_width = platform::RoundToPowerOfTwo(index_length);
+    block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X);
     auto block_height =
         platform::RoundToPowerOfTwo(index_length * batch_size) / block_width;
+    block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width);
     dim3 block_dim(block_width, block_height);
     dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x,
                   (batch_size + block_dim.y - 1) / block_dim.y);
+    LimitGridDim(ctx, &grid_dim);
 
     math::SetConstant<platform::CUDADeviceContext, T> set_zero;
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

From 89f903da1fba9527dc900266baf5a17e6711d7d8 Mon Sep 17 00:00:00 2001
From: ronnywang <524019753@qq.com>
Date: Fri, 21 Jan 2022 13:21:31 +0800
Subject: [PATCH 04/15] fix npu c_allgather int64 (#39099)

---
 paddle/fluid/operators/collective/c_allgather_op_npu.cc | 1 +
 paddle/fluid/platform/device/npu/hccl_helper.h          | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc
index 4fa27f5eb9bee..5ebcc9064f790 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -79,5 +79,6 @@ namespace plat = paddle::platform;
 
 REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel<int8_t>,
                        ops::CAllGatherOpASCENDKernel<int>,
+                       ops::CAllGatherOpASCENDKernel<int64_t>,
                        ops::CAllGatherOpASCENDKernel<float>,
                        ops::CAllGatherOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h
index c2338fff02926..efbc56bee720b 100644
--- a/paddle/fluid/platform/device/npu/hccl_helper.h
+++ b/paddle/fluid/platform/device/npu/hccl_helper.h
@@ -41,6 +41,8 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) {
     return HCCL_DATA_TYPE_FP32;
   } else if (type == framework::proto::VarType::FP16) {
     return HCCL_DATA_TYPE_FP16;
+  } else if (type == framework::proto::VarType::INT64) {
+    return HCCL_DATA_TYPE_INT64;
   } else if (type == framework::proto::VarType::INT32) {
     return HCCL_DATA_TYPE_INT32;
   } else if (type == framework::proto::VarType::INT8) {

From cf6516ffab24cc6ebc8b167dba53567ab1e60eb6 Mon Sep 17 00:00:00 2001
From: wuhuanzhou <mr.avin0323@gmail.com>
Date: Fri, 21 Jan 2022 13:49:08 +0800
Subject: [PATCH 05/15] update recommend member (#39083)

* update recommend member, test=document_fix

* remove update of UB rule file, test=document_fix
---
 paddle/scripts/paddle_build.sh | 4 ++--
 tools/ci_op_benchmark.sh       | 4 ++--
 tools/test_ci_op_benchmark.sh  | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 7a2fa58be4978..cf326a68e5948 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -2347,11 +2347,11 @@ function collect_ccache_hits() {
 
 function test_op_benchmark() {
     # The PR will pass quickly when get approval from specific person.
-    # Xreki 12538138, luotao1 6836917, Avin0323 23427135
+    # Xreki 12538138, luotao1 6836917, ZzSean 32410583
     set +x
     approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
     if [ "${approval_line}" != "" ]; then
-        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917)
+        APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917)
         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
         if [ "${APPROVALS}" == "TRUE" ]; then
             echo "==================================="
diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh
index 48074c205774c..f2f83c8dfbb8d 100644
--- a/tools/ci_op_benchmark.sh
+++ b/tools/ci_op_benchmark.sh
@@ -286,11 +286,11 @@ function gpu_op_benchmark {
 
 
 # The PR will pass quickly when get approval from specific person.
-# Xreki 12538138, luotao1 6836917, Avin0323 23427135
+# Xreki 12538138, luotao1 6836917, ZzSean 32410583
 set +x
 approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
 if [ -n "${approval_line}" ]; then
-  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917)
+  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917)
   LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
   if [ "${APPROVALS}" == "TRUE" ]; then
     LOG "[INFO] ==================================="
diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh
index 25e613dd6bcd0..bf70d8bc3a495 100644
--- a/tools/test_ci_op_benchmark.sh
+++ b/tools/test_ci_op_benchmark.sh
@@ -273,7 +273,7 @@ function check_CHANGE_OP_MAP {
   done
   if [ $exit_code -ne 0 ]; then
     LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details."
-    LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR."
+    LOG "[INFO] Or you can apply for one RD (ZzSean(Recommend), Xreki, luotao1) approval to pass this PR."
     exit $exit_code
   fi
 }
@@ -317,11 +317,11 @@ function gpu_op_benchmark {
 }
 
 # The PR will pass quickly when get approval from specific person.
-# Xreki 12538138, luotao1 6836917, Avin0323 23427135
+# Xreki 12538138, luotao1 6836917, ZzSean 32410583
 set +x
 approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000)
 if [ -n "${approval_line}" ]; then
-  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917)
+  APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917)
   LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
   if [ "${APPROVALS}" == "TRUE" ]; then
     LOG "[INFO] ==================================="

From 4e23ba325db40a212ed30165143bcb5301bd106c Mon Sep 17 00:00:00 2001
From: Aurelius84 <zhangliujie@baidu.com>
Date: Fri, 21 Jan 2022 14:55:00 +0800
Subject: [PATCH 06/15] [PTen]Migrate Dim and DDim from paddle::framework into
 pten namespace (#39053)

* Migrate Dim and DDim from paddle::framework into pten namespace

* fix paddle::framework::Array

* fix framework::Array
---
 paddle/fluid/framework/CMakeLists.txt         |   8 -
 paddle/fluid/framework/ddim.h                 | 230 +---------------
 paddle/fluid/framework/ddim_test.cc           |  84 ------
 paddle/fluid/framework/dim.h                  |  82 +-----
 .../amp/check_finite_and_unscale_op.h         |   2 +-
 .../operators/amp/update_loss_scaling_op.h    |   2 +-
 paddle/fluid/operators/bce_loss_op.cu         |   2 +-
 paddle/fluid/operators/bernoulli_op.h         |   2 +-
 paddle/fluid/operators/bilateral_slice_op.h   |   2 +-
 paddle/fluid/operators/bincount_op.cu         |   2 +-
 paddle/fluid/operators/deformable_conv_func.h |   2 +-
 paddle/fluid/operators/dequantize_log_op.cu   |   2 +-
 .../fluid/operators/detection/box_clip_op.cu  |   2 +-
 .../detection/sigmoid_focal_loss_op.cu        |   2 +-
 .../fluid/operators/detection/yolo_box_op.h   |   2 +-
 paddle/fluid/operators/distribution_helper.h  |   2 +-
 .../elementwise/elementwise_functor.h         |  34 +--
 paddle/fluid/operators/fake_quantize_op.h     |   2 +-
 paddle/fluid/operators/grid_sampler_op.h      |   2 +-
 paddle/fluid/operators/histogram_op.cu        |   2 +-
 paddle/fluid/operators/huber_loss_op.h        |   2 +-
 paddle/fluid/operators/interpolate_op.h       |   2 +-
 paddle/fluid/operators/interpolate_v2_op.h    |   2 +-
 .../kernel_primitives/datamover_primitives.h  |   3 +-
 paddle/fluid/operators/kldiv_loss_op.h        |   2 +-
 paddle/fluid/operators/lstm_unit_op.cu        |   2 +-
 paddle/fluid/operators/math.h                 |   2 +-
 paddle/fluid/operators/math/algorithm.h       |   2 +-
 .../fluid/operators/math/complex_functors.h   |   2 +-
 paddle/fluid/operators/math/cos_sim_functor.h |   2 +-
 paddle/fluid/operators/math/cross_entropy.h   |   2 +-
 paddle/fluid/operators/math/depthwise_conv.h  |   2 +-
 .../math/detail/activation_functions.h        |   2 +-
 .../fluid/operators/math/detail/gru_kernel.h  |   2 +-
 .../fluid/operators/math/detail/lstm_kernel.h |   2 +-
 paddle/fluid/operators/math/maxouting.h       |   2 +-
 paddle/fluid/operators/math/pooling.h         |   2 +-
 .../fluid/operators/modified_huber_loss_op.cu |   2 +-
 .../fluid/operators/modified_huber_loss_op.h  |   2 +-
 paddle/fluid/operators/multinomial_op.h       |   2 +-
 paddle/fluid/operators/nll_loss_op.cu         |   2 +-
 paddle/fluid/operators/roll_op.cu             |  14 +-
 .../sigmoid_cross_entropy_with_logits_op.cu   |   2 +-
 paddle/fluid/operators/smooth_l1_loss_op.h    |   2 +-
 paddle/fluid/operators/unstack_op.h           |   1 -
 paddle/fluid/platform/aligned_vector.h        |   2 +-
 paddle/fluid/platform/eigen_ext.h             |   2 +-
 paddle/fluid/platform/transform.h             |   2 +-
 paddle/fluid/platform/transform_test.cu       |   2 +-
 paddle/pten/api/include/tensor.h              |  10 +-
 paddle/pten/api/lib/tensor.cc                 |   6 +-
 paddle/pten/core/CMakeLists.txt               |   9 +
 paddle/{fluid/framework => pten/core}/array.h |  10 +-
 paddle/{fluid/framework => pten/core}/ddim.cc |  85 +++---
 paddle/pten/core/ddim.h                       | 257 ++++++++++++++++++
 paddle/pten/core/ddim_test.cc                 |  83 ++++++
 paddle/pten/core/dim.h                        | 100 +++++++
 .../framework => pten/core}/dim_test.cu       |  41 +--
 .../platform => pten/core}/hostdevice.h       |   5 +-
 paddle/pten/core/tensor_base.h                |   4 +-
 paddle/pten/core/tensor_meta.h                |   4 +-
 .../core}/unroll_array_ops.h                  |   8 +-
 .../core}/unroll_array_ops_test.cc            |   8 +-
 paddle/pten/infermeta/binary.cc               |   8 +-
 paddle/pten/infermeta/nullary.cc              |   4 +-
 paddle/pten/infermeta/unary.cc                |  28 +-
 paddle/pten/kernels/cpu/elementwise.h         |   4 +-
 paddle/pten/kernels/cpu/reduce.h              |   4 +-
 paddle/pten/kernels/empty_kernel.cc           |   2 +-
 paddle/pten/kernels/flatten_grad_kernel.cc    |   3 +-
 paddle/pten/kernels/funcs/common_shape.h      |   2 +-
 paddle/pten/kernels/funcs/elementwise_base.h  |  26 +-
 .../pten/kernels/funcs/elementwise_functor.h  |   2 +-
 paddle/pten/kernels/funcs/transpose.cc        |   6 +-
 paddle/pten/kernels/funcs/transpose.cu        |   6 +-
 paddle/pten/kernels/funcs/transpose.h         |   2 +-
 paddle/pten/kernels/gpu/elementwise.h         |  37 ++-
 paddle/pten/kernels/gpu/reduce.h              |  20 +-
 .../pten/kernels/impl/dot_grad_kernel_impl.h  |   4 +-
 paddle/pten/kernels/impl/full_kernel_impl.h   |   2 +-
 .../kernels/impl/matmul_grad_kernel_impl.h    |   4 +-
 paddle/pten/kernels/impl/matmul_kernel_impl.h |  10 +-
 paddle/pten/tests/api/test_cast_api.cc        |   2 +-
 paddle/pten/tests/api/test_conj_api.cc        |   2 +-
 paddle/pten/tests/api/test_dot_api.cc         |   2 +-
 paddle/pten/tests/api/test_elementwise_api.cc |   2 +-
 paddle/pten/tests/api/test_empty_api.cc       |   2 +-
 paddle/pten/tests/api/test_fill_api.cc        |   2 +-
 paddle/pten/tests/api/test_flatten_api.cc     |   2 +-
 paddle/pten/tests/api/test_matmul_api.cc      |   2 +-
 paddle/pten/tests/api/test_mean_api.cc        |   2 +-
 paddle/pten/tests/api/test_reshape_api.cc     |   2 +-
 paddle/pten/tests/api/test_scale_api.cc       |   2 +-
 paddle/pten/tests/api/test_sum_api.cc         |   2 +-
 paddle/pten/tests/api/test_to_api.cc          |   2 +-
 .../pten/tests/kernels/test_cast_dev_api.cc   |   2 +-
 .../pten/tests/kernels/test_conj_dev_api.cc   |   2 +-
 .../pten/tests/kernels/test_copy_dev_api.cc   |   2 +-
 .../tests/kernels/test_creation_dev_api.cc    |   2 +-
 paddle/pten/tests/kernels/test_dot_dev_api.cc |   2 +-
 .../tests/kernels/test_elementwise_dev_api.cc |   2 +-
 .../tests/kernels/test_flatten_dev_api.cc     |   2 +-
 .../pten/tests/kernels/test_matmul_dev_api.cc |   2 +-
 .../pten/tests/kernels/test_mean_dev_api.cc   |   2 +-
 .../tests/kernels/test_reshape_dev_api.cc     |   2 +-
 .../pten/tests/kernels/test_scale_dev_api.cc  |   2 +-
 paddle/pten/tests/kernels/test_sum_dev_api.cc |   2 +-
 107 files changed, 734 insertions(+), 658 deletions(-)
 delete mode 100644 paddle/fluid/framework/ddim_test.cc
 rename paddle/{fluid/framework => pten/core}/array.h (94%)
 rename paddle/{fluid/framework => pten/core}/ddim.cc (77%)
 create mode 100644 paddle/pten/core/ddim.h
 create mode 100644 paddle/pten/core/ddim_test.cc
 create mode 100644 paddle/pten/core/dim.h
 rename paddle/{fluid/framework => pten/core}/dim_test.cu (62%)
 rename paddle/{fluid/platform => pten/core}/hostdevice.h (89%)
 rename paddle/{fluid/framework => pten/core}/unroll_array_ops.h (96%)
 rename paddle/{fluid/framework => pten/core}/unroll_array_ops_test.cc (92%)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 902943d14ff9d..83e5c1c17925e 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -55,14 +55,6 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
 
 cc_library(string_array SRCS string_array.cc DEPS utf8proc)
 
-cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
-cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-if(WITH_GPU)
-  nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-elseif(WITH_ROCM)
-  hip_test(dim_test SRCS dim_test.cu DEPS ddim)
-endif()
-cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor)
 if(WITH_GPU)
diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
index 565e0b430dfdc..d150cca9d4c67 100644
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -14,237 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include <initializer_list>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/dim.h"
+#include "paddle/pten/core/ddim.h"
 
 namespace paddle {
 namespace framework {
 
-#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
-  case (rank): {                               \
-    constexpr auto kRank = (rank);             \
-    return (callback);                         \
-  }
-
-#define PADDLE_VISIT_DDIM(rank, callback)                                  \
-  switch (rank) {                                                          \
-    PADDLE_VISIT_DDIM_BASE(0, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(1, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(2, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(3, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(4, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(5, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(6, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(7, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(8, callback);                                   \
-    PADDLE_VISIT_DDIM_BASE(9, callback);                                   \
-    default:                                                               \
-      PADDLE_THROW(platform::errors::Unimplemented(                        \
-          "Invalid dimension to be accessed. Now only supports access to " \
-          "dimension 0 to 9, but received dimension is %d.",               \
-          rank));                                                          \
-  }
-
-template <typename T1, typename T2>
-inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
-  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
-}
-
-/**
- * \brief A dynamically sized dimension.
- *
- * The number of dimensions must be between [1, 9].
- */
-class DDim {
- public:
-  constexpr static int kMaxRank = 9;
-
-  DDim() : rank_(1) { dim_[0] = 0; }
-
-  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
-
-  DDim(const int* d, int n) : rank_(n) {
-    dynamic_dim_assign(d, dim_.GetMutable(), n);
-  }
-
-  DDim(const int64_t* d, int n) : rank_(n) {
-    dynamic_dim_assign(d, dim_.GetMutable(), n);
-  }
-
-  template <int D>
-  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
-    UnsafeCast<D>() = in;
-  }
-
-  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
-      : DDim(init_list.begin(), init_list.size()) {}
-
-  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
-
-  template <int D>
-  inline DDim& operator=(const Dim<D>& dim) {
-    rank_ = D;
-    UnsafeCast<D>() = dim;
-    return *this;
-  }
-
-  inline int64_t& operator[](int idx) { return dim_[idx]; }
-
-  inline int64_t operator[](int idx) const { return dim_[idx]; }
-
-  int64_t& at(int idx) {
-    PADDLE_ENFORCE_GE(idx, 0,
-                      platform::errors::InvalidArgument(
-                          "Invalid DDim index to be accessed. The valid index "
-                          "is between 0 and %d, but received index is %d.",
-                          rank_, idx));
-    PADDLE_ENFORCE_LT(idx, rank_,
-                      platform::errors::InvalidArgument(
-                          "Invalid DDim index to be accessed. The valid index "
-                          "is between 0 and %d, but received index is %d.",
-                          rank_, idx));
-    return dim_[idx];
-  }
-
-  int64_t at(int idx) const {
-    PADDLE_ENFORCE_GE(idx, 0,
-                      platform::errors::InvalidArgument(
-                          "Invalid DDim index to be accessed. The valid index "
-                          "is between 0 and %d, but received index is %d.",
-                          rank_, idx));
-    PADDLE_ENFORCE_LT(idx, rank_,
-                      platform::errors::InvalidArgument(
-                          "Invalid DDim index to be accessed. The valid index "
-                          "is between 0 and %d, but received index is %d.",
-                          rank_, idx));
-    return dim_[idx];
-  }
-
-  template <typename Visitor>
-  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
-      Visitor&& visitor) {
-    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
-  }
-
-  template <typename Visitor>
-  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
-      Visitor&& visitor) const {
-    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
-  }
-
-  bool operator==(const DDim& d) const;
-
-  bool operator!=(const DDim& d) const;
-
-  inline const int64_t* Get() const { return dim_.Get(); }
-
-  inline int64_t* GetMutable() { return dim_.GetMutable(); }
-
-  inline int size() const { return rank_; }
-
-  std::string to_str() const;
-
-  DDim reshape(const std::vector<int>& shape) const;
-
-  DDim transpose(const std::vector<int>& axis) const;
-
- private:
-  template <int D>
-  inline Dim<D>& UnsafeCast() {
-    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
-    auto* p = static_cast<void*>(&dim_);
-    return *reinterpret_cast<Dim<D>*>(p);
-  }
-
-  template <int D>
-  inline const Dim<D>& UnsafeCast() const {
-    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
-    auto* p = static_cast<const void*>(&dim_);
-    return *reinterpret_cast<const Dim<D>*>(p);
-  }
-
-  inline DDim& CopyFrom(const DDim& ddim) {
-    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
-  }
-
-  friend DDim stride(const DDim& ddim);
-  friend DDim stride_numel(const DDim& ddim);
-
- private:
-  Dim<kMaxRank> dim_;
-  int rank_;
-};
-
-#undef PADDLE_VISIT_DDIM_BASE
-#undef PADDLE_VISIT_DDIM
-
-/**
- * \brief Make a DDim from std::vector<int64_t>
- *
- * \param dims An vector of ints. Must be sized between [1, 9]
- */
-DDim make_ddim(const std::vector<int64_t>& dims);
-
-DDim make_ddim(const std::vector<int>& dims);
-
-/**
- * \brief Make a DDim from an initializer list
- *
- * \param dims An initializer list of ints. Must be sized between [1, 9]
- *
- */
-DDim make_ddim(std::initializer_list<int64_t> dims);
-
-template <typename T = int64_t>
-std::vector<T> vectorize(const DDim& ddim) {
-  std::vector<T> result(DDim::kMaxRank);
-  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
-  result.resize(ddim.size());
-  return result;
-}
-
-int64_t product(const DDim& ddim);
-
-bool contain_unknown_dim(const DDim& ddim);
-
-/**
- * \brief Slice a ddim
- *
- * Slice dim with [begin, end).
- * e.g.  DDim d = make_ddim({1,2,3,4,5});
- *       slice_ddim(d, 1, 3); ====> {2,3}
- */
-DDim slice_ddim(const DDim& dim, int begin, int end);
-
-/**
- * \brief What is the length of this dimension?
- *
- * \param Dynamic dimension to inspect
- */
-
-int arity(const DDim& ddim);
-
-std::ostream& operator<<(std::ostream&, const DDim&);
-
-/**
-* \brief Flatten dim to 3d
-* e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6})
-*       flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30}
-*/
-DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims);
-
-// Reshape a tensor to a matrix. The matrix's first dimension(column length)
-// will be the product of tensor's first `num_col_dims` dimensions.
-DDim flatten_to_2d(const DDim& src, int num_col_dims);
-
-DDim flatten_to_1d(const DDim& src);
-
-DDim stride(const DDim& ddim);
+using DDim = pten::framework::DDim;
+using namespace pten::framework;  // NOLINT
 
-DDim stride_numel(const DDim& ddim);
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc
deleted file mode 100644
index e89f77ae496c4..0000000000000
--- a/paddle/fluid/framework/ddim_test.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <sstream>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/ddim.h"
-
-TEST(DDim, Equality) {
-  // construct a DDim from an initialization list
-  paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5});
-  EXPECT_EQ(ddim[0], 9);
-  EXPECT_EQ(ddim[1], 1);
-  EXPECT_EQ(ddim[2], 5);
-
-  // construct a DDim from a vector
-  std::vector<int64_t> vec({9, 1, 5});
-  paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
-  EXPECT_EQ(ddim[0], 9);
-  EXPECT_EQ(ddim[1], 1);
-  EXPECT_EQ(ddim[2], 5);
-
-  // mutate a DDim
-  ddim[1] = 2;
-  EXPECT_EQ(ddim[1], 2);
-  ddim[0] = 6;
-  EXPECT_EQ(ddim[0], 6);
-
-  // vectorize a DDim
-  std::vector<int64_t> res_vec = paddle::framework::vectorize(vddim);
-  EXPECT_EQ(res_vec[0], 9);
-  EXPECT_EQ(res_vec[1], 1);
-  EXPECT_EQ(res_vec[2], 5);
-  paddle::framework::Dim<3> d(3, 2, 1);
-  res_vec = paddle::framework::vectorize(paddle::framework::DDim(d));
-  EXPECT_EQ(res_vec[0], 3);
-  EXPECT_EQ(res_vec[1], 2);
-  EXPECT_EQ(res_vec[2], 1);
-
-  // arity of a DDim
-  EXPECT_EQ(paddle::framework::arity(ddim), 3);
-  EXPECT_EQ(ddim.size(), 3);
-
-  // product of a DDim
-  EXPECT_EQ(paddle::framework::product(vddim), 45);
-  EXPECT_EQ(
-      paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
-      90);
-
-  // slice a DDim
-  paddle::framework::DDim ddim2 =
-      paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
-  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
-  EXPECT_EQ(arity(ss), 3);
-  EXPECT_EQ(ss[0], 3);
-  EXPECT_EQ(ss[1], 4);
-  EXPECT_EQ(ss[2], 5);
-  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
-  EXPECT_EQ(arity(ss2), 6);
-  EXPECT_EQ(ss2[0], 1);
-  EXPECT_EQ(ss2[1], 2);
-  EXPECT_EQ(ss2[2], 3);
-  EXPECT_EQ(ss2[3], 4);
-  EXPECT_EQ(ss2[4], 5);
-  EXPECT_EQ(ss2[5], 6);
-}
-
-TEST(DDim, Print) {
-  // print a DDim
-  std::stringstream ss;
-  paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4});
-  ss << ddim;
-  EXPECT_EQ("2, 3, 4", ss.str());
-}
diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h
index 66214b265fdf9..6abae4e731832 100644
--- a/paddle/fluid/framework/dim.h
+++ b/paddle/fluid/framework/dim.h
@@ -12,89 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-#include <type_traits>
-
-#include "paddle/fluid/framework/array.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/dim.h"
 
 namespace paddle {
 namespace framework {
-
-// Statically sized, statically indexed dimension
 template <int D>
-class Dim : public Array<int64_t, D> {
- public:
-  static_assert(D >= 0, "D must be not less than 0");
-
-  static constexpr int kRank = D;
-  using BaseClass = Array<int64_t, D>;
-
-  inline Dim(int64_t head, const Dim<D - 1>& tail) {
-    (*this)[0] = head;
-    new (this->GetMutable() + 1) Dim<D - 1>(tail);
-  }
-
-  template <typename... Args>
-  HOSTDEVICE explicit Dim(int64_t head, Args... args)
-      : BaseClass(head, args...) {}
-
-  /** Construct a Dim with each dimension set to the given index */
-  HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }
-
-  HOSTDEVICE Dim() = default;
-
-  HOST std::string to_string() const;
-};
-
-// Product of a Dim
-template <int D>
-HOSTDEVICE inline int64_t product(const Dim<D>& a) {
-  return UnrollProduct<D>::Run(a.Get());
-}
-
-/**
- * Helper function to create a Dim
- *
- * \param idxes The type of Dim constructed depends on the number of params
- *
- */
-
-template <typename... Args>
-HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
-  return Dim<sizeof...(Args)>(idxes...);
-}
-
-// Allows us to output a Dim
-template <int D>
-inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
-  os << d[0];
-  for (int i = 1; i < D; ++i) {
-    os << ", " << d[i];
-  }
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
-  return os;
-}
-
-template <int D>
-HOST std::string Dim<D>::to_string() const {
-  std::stringstream stream;
-  stream << *this;
-  return stream.str();
-}
-
-template <int D, typename T1, typename T2>
-inline void static_dim_assign(const T1* in, T2* out) {
-  UnrollAssign<D>::Run(in, out);
-}
+using Dim = pten::framework::Dim<D>;
+using namespace pten::framework;  // NOLINT
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h
index 29b96c4a6704a..49ca2c3862a5e 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/operators/isfinite_op.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h
index decc3c3b924c4..2c953d4eee373 100644
--- a/paddle/fluid/operators/amp/update_loss_scaling_op.h
+++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h
@@ -24,7 +24,7 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu
index d493dad132992..6595d6deccd9a 100644
--- a/paddle/fluid/operators/bce_loss_op.cu
+++ b/paddle/fluid/operators/bce_loss_op.cu
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h
index 40f285d11f194..da66742e08fd9 100644
--- a/paddle/fluid/operators/bernoulli_op.h
+++ b/paddle/fluid/operators/bernoulli_op.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/bilateral_slice_op.h b/paddle/fluid/operators/bilateral_slice_op.h
index 0903fe4c71d3d..3ef13c421cdfb 100644
--- a/paddle/fluid/operators/bilateral_slice_op.h
+++ b/paddle/fluid/operators/bilateral_slice_op.h
@@ -13,7 +13,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu
index cf189193d1c11..5964b9e345e93 100644
--- a/paddle/fluid/operators/bincount_op.cu
+++ b/paddle/fluid/operators/bincount_op.cu
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/bincount_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/deformable_conv_func.h b/paddle/fluid/operators/deformable_conv_func.h
index ba1c504430223..99d1d7c4776c3 100644
--- a/paddle/fluid/operators/deformable_conv_func.h
+++ b/paddle/fluid/operators/deformable_conv_func.h
@@ -24,7 +24,7 @@
 #pragma once
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 template <typename T>
 HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h,
diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu
index 39f4fdb71b69d..821b87bf0595a 100644
--- a/paddle/fluid/operators/dequantize_log_op.cu
+++ b/paddle/fluid/operators/dequantize_log_op.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/dequantize_log_op.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu
index 17013efcc98b7..53727d9d08747 100644
--- a/paddle/fluid/operators/detection/box_clip_op.cu
+++ b/paddle/fluid/operators/detection/box_clip_op.cu
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/box_clip_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
index 10c402e5a4078..7102c4cffe21a 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h
index e06c81052a0f4..31a67ecc26635 100644
--- a/paddle/fluid/operators/detection/yolo_box_op.h
+++ b/paddle/fluid/operators/detection/yolo_box_op.h
@@ -14,7 +14,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h
index 8bb963979e5a7..a13ae57090687 100644
--- a/paddle/fluid/operators/distribution_helper.h
+++ b/paddle/fluid/operators/distribution_helper.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/for_range.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 #if !defined(_WIN32)
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h
index 8a6cadc2413dc..daca105ce46bb 100644
--- a/paddle/fluid/operators/elementwise/elementwise_functor.h
+++ b/paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/core/array.h"
 #include "paddle/pten/kernels/funcs/elementwise_functor.h"
 
 namespace paddle {
@@ -92,12 +92,12 @@ using Complex = paddle::platform::complex<T>;
 
 template <typename InT, typename OutT>
 struct DivGradXYFunctor {
-  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(const InT a,
-                                                                 const InT b,
-                                                                 const InT c) {
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT a,
+                                                               const InT b,
+                                                               const InT c) {
     // dx = dout / y
     // dy = - dout * out / y
-    paddle::framework::Array<OutT, 2> outs;
+    pten::framework::Array<OutT, 2> outs;
     outs[0] = a / c;
     outs[1] = -a * b / c;
     return outs;
@@ -106,9 +106,9 @@ struct DivGradXYFunctor {
 
 template <typename InT, typename OutT>
 struct DivGradXYFunctor<Complex<InT>, Complex<OutT>> {
-  inline HOSTDEVICE paddle::framework::Array<Complex<OutT>, 2> operator()(
+  inline HOSTDEVICE pten::framework::Array<Complex<OutT>, 2> operator()(
       const Complex<InT> a, const Complex<InT> b, const Complex<InT> c) {
-    paddle::framework::Array<Complex<OutT>, 2> outs;
+    pten::framework::Array<Complex<OutT>, 2> outs;
     Complex<InT> c_conj(c.real, -c.imag);
     Complex<InT> out_div_c_conj((b / c).real, -(b / c).imag);
     outs[0] = a / c_conj;
@@ -247,9 +247,9 @@ struct MinGradYFunctor {
 
 template <typename InT, typename OutT>
 struct MinGradXYFunctor {
-  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
       const InT& x, const InT& y, const InT& dout) {
-    paddle::framework::Array<OutT, 2> outs;
+    pten::framework::Array<OutT, 2> outs;
     // dx = dout * (x < y)
     outs[0] = static_cast<OutT>(dout * static_cast<InT>(x < y));
     // dy = dout * (x >= y)
@@ -273,10 +273,10 @@ struct MulGradFunctor<Complex<T>> {
 
 template <typename InT, typename OutT>
 struct MulGradXYFunctor {
-  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(const InT a,
-                                                                 const InT b,
-                                                                 const InT c) {
-    paddle::framework::Array<OutT, 2> outs;
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(const InT a,
+                                                               const InT b,
+                                                               const InT c) {
+    pten::framework::Array<OutT, 2> outs;
     // dx = dout * y
     outs[0] = a * b;
     // dy = dout * x
@@ -287,9 +287,9 @@ struct MulGradXYFunctor {
 
 template <typename InT, typename OutT>
 struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
-  inline HOSTDEVICE paddle::framework::Array<Complex<OutT>, 2> operator()(
+  inline HOSTDEVICE pten::framework::Array<Complex<OutT>, 2> operator()(
       const Complex<InT> a, const Complex<InT> b, const Complex<InT> c) {
-    paddle::framework::Array<Complex<OutT>, 2> outs;
+    pten::framework::Array<Complex<OutT>, 2> outs;
     // dx = dout * y
     Complex<InT> b_conj(b.real, -b.imag);
     outs[0] = a * b_conj;
@@ -316,9 +316,9 @@ struct MaxGradYFunctor {
 
 template <typename InT, typename OutT>
 struct MaxGradXYFunctor {
-  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(
+  inline HOSTDEVICE pten::framework::Array<OutT, 2> operator()(
       const InT& x, const InT& y, const InT& dout) {
-    paddle::framework::Array<OutT, 2> outs;
+    pten::framework::Array<OutT, 2> outs;
     // dx = dout * (x > y)
     outs[0] = static_cast<OutT>(dout * static_cast<InT>(x > y));
     // dy = dout * (x <= y)
diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h
index 21e7079ff6233..c31139611e84c 100644
--- a/paddle/fluid/operators/fake_quantize_op.h
+++ b/paddle/fluid/operators/fake_quantize_op.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h
index da386052c7dc0..a595e5078b21d 100644
--- a/paddle/fluid/operators/grid_sampler_op.h
+++ b/paddle/fluid/operators/grid_sampler_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu
index 2bf259f7d7a7a..a34f4b8a22e57 100644
--- a/paddle/fluid/operators/histogram_op.cu
+++ b/paddle/fluid/operators/histogram_op.cu
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/histogram_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h
index 93cfba1964684..fbfed71e1ecd4 100644
--- a/paddle/fluid/operators/huber_loss_op.h
+++ b/paddle/fluid/operators/huber_loss_op.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h
index baa292319d36e..0c0dde6bd4536 100644
--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
@@ -15,7 +15,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h
index a5afb18b3ff6f..4d6189b57bf1c 100644
--- a/paddle/fluid/operators/interpolate_v2_op.h
+++ b/paddle/fluid/operators/interpolate_v2_op.h
@@ -15,7 +15,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
index ce45ed0301e92..45697073cbf85 100644
--- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
@@ -20,6 +20,7 @@
 #ifdef PADDLE_WITH_HIP
 #include <hip/hip_fp16.h>
 #endif
+#include "paddle/pten/core/ddim.h"
 
 namespace paddle {
 namespace operators {
@@ -85,7 +86,7 @@ struct FastDivMod {
 template <int kDims>
 struct BroadcastConfig {
   FastDivMod divmoders[kDims];
-  uint32_t strides[framework::DDim::kMaxRank];
+  uint32_t strides[pten::framework::DDim::kMaxRank];
   HOSTDEVICE BroadcastConfig() {}
 
   HOSTDEVICE BroadcastConfig(const std::vector<int64_t>& out_dims,
diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h
index 0bc53d7dd7b3b..40199677fe9a3 100644
--- a/paddle/fluid/operators/kldiv_loss_op.h
+++ b/paddle/fluid/operators/kldiv_loss_op.h
@@ -13,7 +13,7 @@
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
index 3949a066e0868..b758efb065209 100644
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -19,7 +19,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cross_entropy_op.h"
 #include "paddle/fluid/operators/lstm_unit_op.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h
index 3b28928a52892..f5ce5af70bd7a 100644
--- a/paddle/fluid/operators/math.h
+++ b/paddle/fluid/operators/math.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 #include "math.h"  // NOLINT
 
diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h
index 346c693a22d85..cbe1a03d90d85 100644
--- a/paddle/fluid/operators/math/algorithm.h
+++ b/paddle/fluid/operators/math/algorithm.h
@@ -18,7 +18,7 @@
 #include <cstdint>  // for int64_t
 #include <numeric>
 
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/fluid/operators/math/complex_functors.h
index 3214adb095376..48f16b87cbd66 100644
--- a/paddle/fluid/operators/math/complex_functors.h
+++ b/paddle/fluid/operators/math/complex_functors.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <type_traits>
 
 #include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h
index 9a24bfc331266..61827af950bd5 100644
--- a/paddle/fluid/operators/math/cos_sim_functor.h
+++ b/paddle/fluid/operators/math/cos_sim_functor.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <stdlib.h>
 
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h
index db19818951d7c..e7ac1760d3b9c 100644
--- a/paddle/fluid/operators/math/cross_entropy.h
+++ b/paddle/fluid/operators/math/cross_entropy.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <limits>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h
index f88b4a6e41cf9..89a1efe133387 100644
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index 38bd1a3dadb63..def25a680cb95 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h
index d9be8e80658fa..603f5f3426f0d 100644
--- a/paddle/fluid/operators/math/detail/gru_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <type_traits>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 // TODO(guosheng): refine code style in gru_kernel
 namespace paddle {
diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h
index 003ec194366c9..33dcde4590068 100644
--- a/paddle/fluid/operators/math/detail/lstm_kernel.h
+++ b/paddle/fluid/operators/math/detail/lstm_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <type_traits>
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h
index 50bddf73bc10c..ceeb85d6d36ef 100644
--- a/paddle/fluid/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
index 4743f0dc9faf1..f0637a40b8cde 100644
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu
index 3c85da3c52c6c..ea08dc8084abf 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cu
+++ b/paddle/fluid/operators/modified_huber_loss_op.cu
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <thrust/tuple.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/modified_huber_loss_op.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h
index 398676ba74151..4f552edf97bbe 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.h
+++ b/paddle/fluid/operators/modified_huber_loss_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/multinomial_op.h b/paddle/fluid/operators/multinomial_op.h
index 14cfbd268389e..df4c2e9e7bbf6 100644
--- a/paddle/fluid/operators/multinomial_op.h
+++ b/paddle/fluid/operators/multinomial_op.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu
index 03af45634149d..e3c99afe820c2 100644
--- a/paddle/fluid/operators/nll_loss_op.cu
+++ b/paddle/fluid/operators/nll_loss_op.cu
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/nll_loss_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu
index 57986d262820d..7e8e37bd2ee8f 100644
--- a/paddle/fluid/operators/roll_op.cu
+++ b/paddle/fluid/operators/roll_op.cu
@@ -13,11 +13,11 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/roll_op.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/core/array.h"
 
 namespace paddle {
 namespace operators {
@@ -28,9 +28,9 @@ using LoDTensor = framework::LoDTensor;
 
 template <typename T, size_t Rank>
 __global__ void RollCudaKernel(const T* input, T* output, int64_t N,
-                               paddle::framework::Array<int64_t, Rank> shifts,
-                               paddle::framework::Array<int64_t, Rank> strides,
-                               paddle::framework::Array<int64_t, Rank> sizes) {
+                               pten::framework::Array<int64_t, Rank> shifts,
+                               pten::framework::Array<int64_t, Rank> strides,
+                               pten::framework::Array<int64_t, Rank> sizes) {
   int64_t idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx >= N) {
     return;
@@ -101,9 +101,9 @@ class RollKernel<platform::CUDADeviceContext, T>
 
 #define CALL_ROLL_CUDA_KERNEL(N)                                               \
   case N: {                                                                    \
-    paddle::framework::Array<int64_t, N> _strides;                             \
-    paddle::framework::Array<int64_t, N> _shifts;                              \
-    paddle::framework::Array<int64_t, N> _sizes;                               \
+    pten::framework::Array<int64_t, N> _strides;                               \
+    pten::framework::Array<int64_t, N> _shifts;                                \
+    pten::framework::Array<int64_t, N> _sizes;                                 \
     for (size_t idx = 0; idx < N; ++idx) {                                     \
       _strides[idx] = strides[idx];                                            \
       _shifts[idx] = shifts[idx];                                              \
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
index cc012230c1062..de29822b8d7fe 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -22,7 +22,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h
index efe3afba18e8f..e30b48b1500ed 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.h
+++ b/paddle/fluid/operators/smooth_l1_loss_op.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h
index cfd4d6bce8364..413470e3db5d4 100644
--- a/paddle/fluid/operators/unstack_op.h
+++ b/paddle/fluid/operators/unstack_op.h
@@ -20,7 +20,6 @@ limitations under the License. */
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/device_vector.h>
-#include "paddle/fluid/framework/array.h"
 #endif
 
 namespace paddle {
diff --git a/paddle/fluid/platform/aligned_vector.h b/paddle/fluid/platform/aligned_vector.h
index 7d014f6bdcb0b..144c017414a5d 100644
--- a/paddle/fluid/platform/aligned_vector.h
+++ b/paddle/fluid/platform/aligned_vector.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h
index 2b3d1693f6245..872a6cf062eef 100644
--- a/paddle/fluid/platform/eigen_ext.h
+++ b/paddle/fluid/platform/eigen_ext.h
@@ -17,7 +17,7 @@
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
 #include "unsupported/Eigen/CXX11/Tensor"
 
diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h
index cc9919d8366be..e3a391462878a 100644
--- a/paddle/fluid/platform/transform.h
+++ b/paddle/fluid/platform/transform.h
@@ -19,8 +19,8 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/core/hostdevice.h"
 
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/execution_policy.h>
diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu
index 23f5865971246..32ec113d1f5e5 100644
--- a/paddle/fluid/platform/transform_test.cu
+++ b/paddle/fluid/platform/transform_test.cu
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/pten/core/hostdevice.h"
 
 template <typename T>
 class Scale {
diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h
index c26c9ce839458..d2afd703eaf2a 100644
--- a/paddle/pten/api/include/tensor.h
+++ b/paddle/pten/api/include/tensor.h
@@ -42,12 +42,12 @@ class DenseTensor;
 
 namespace pten {
 class TensorBase;
+namespace framework {
+class DDim;
+}  // namespace framework
 }  // namespace pten
 
 namespace paddle {
-namespace framework {
-class DDim;
-}
 
 namespace experimental {
 
@@ -159,9 +159,9 @@ class PADDLE_API Tensor final {
   /**
    * @brief Return the dimensions of Tensor.
    *
-   * @return paddle::framework::DDim
+   * @return pten::framework::DDim
    */
-  paddle::framework::DDim dims() const;
+  pten::framework::DDim dims() const;
 
   /**
    * @brief Return the shape (dimensions) of Tensor.
diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc
index cb70d26f947b8..0ccc9c56dbff7 100644
--- a/paddle/pten/api/lib/tensor.cc
+++ b/paddle/pten/api/lib/tensor.cc
@@ -47,13 +47,13 @@ limitations under the License. */
  * In the future, the necessary components will be moved to the this library,
  * or the corresponding components will be re-implemented.
  */
-#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #include "paddle/pten/common/complex.h"
 #include "paddle/pten/common/float16.h"
+#include "paddle/pten/core/ddim.h"
 
 namespace paddle {
 namespace experimental {
@@ -94,10 +94,10 @@ int64_t Tensor::numel() const { return impl_->numel(); }
 
 int64_t Tensor::size() const { return impl_->numel(); }
 
-paddle::framework::DDim Tensor::dims() const { return impl_->dims(); }
+pten::framework::DDim Tensor::dims() const { return impl_->dims(); }
 
 std::vector<int64_t> Tensor::shape() const {
-  return paddle::framework::vectorize<int64_t>(impl_->dims());
+  return pten::framework::vectorize<int64_t>(impl_->dims());
 }
 
 void Tensor::reshape(const std::vector<int64_t> &shape) {
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index facc9ac005662..eabc5a19babad 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -15,6 +15,15 @@ cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector)
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base)
 cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base )
 
+cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc)
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce)
+cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
+if(WITH_GPU)
+  nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+elseif(WITH_ROCM)
+  hip_test(dim_test SRCS dim_test.cu DEPS ddim)
+endif()
+
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
     add_dependencies(dense_tensor mkldnn)
diff --git a/paddle/fluid/framework/array.h b/paddle/pten/core/array.h
similarity index 94%
rename from paddle/fluid/framework/array.h
rename to paddle/pten/core/array.h
index 0ec9cb81129c2..86d222d2d57b3 100644
--- a/paddle/fluid/framework/array.h
+++ b/paddle/pten/core/array.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,10 +15,12 @@
 #pragma once
 
 #include <cstdint>
-#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/pten/core/unroll_array_ops.h"
+// TODO(paddle-dev): Need to modify into pten/core/enforce.h
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
+namespace pten {
+namespace platform = paddle::platform;
 namespace framework {
 
 template <typename T, size_t N>
@@ -146,4 +148,4 @@ class Array<T, 0> {
 };
 
 }  // namespace framework
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/fluid/framework/ddim.cc b/paddle/pten/core/ddim.cc
similarity index 77%
rename from paddle/fluid/framework/ddim.cc
rename to paddle/pten/core/ddim.cc
index 8bac8b7df6d2d..663f92a5bf8d0 100644
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/pten/core/ddim.cc
@@ -1,22 +1,22 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/ddim.h"
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/core/ddim.h"
 #include <set>
-#include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
+namespace pten {
+namespace platform = paddle::platform;
 namespace framework {
 
 DDim make_ddim(std::initializer_list<int64_t> dims) {
@@ -82,10 +82,13 @@ bool contain_unknown_dim(const DDim& ddim) {
 
 DDim slice_ddim(const DDim& dim, int begin, int end) {
   PADDLE_ENFORCE_EQ(
-      (begin >= 0 && end <= dim.size()), true,
+      (begin >= 0 && end <= dim.size()),
+      true,
       platform::errors::InvalidArgument(
-          "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", begin,
-          end, dim.size()));
+          "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.",
+          begin,
+          end,
+          dim.size()));
   // Constructor of DDim would check whether end - begin is valid
   return DDim(dim.Get() + begin, end - begin);
 }
@@ -108,27 +111,34 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
 }
 
 DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims) {
-  PADDLE_ENFORCE_GE(src.size(), 3,
+  PADDLE_ENFORCE_GE(src.size(),
+                    3,
                     platform::errors::InvalidArgument(
                         "The rank of src dim should be at least 3 "
                         "in flatten_to_3d, but received %d.",
                         src.size()));
-  PADDLE_ENFORCE_EQ((num_row_dims >= 1 && num_row_dims < src.size()), true,
+  PADDLE_ENFORCE_EQ((num_row_dims >= 1 && num_row_dims < src.size()),
+                    true,
                     platform::errors::InvalidArgument(
                         "The num_row_dims should be inside [1, %d] "
                         "in flatten_to_3d, but received %d.",
-                        src.size() - 1, num_row_dims));
-  PADDLE_ENFORCE_EQ((num_col_dims >= 2 && num_col_dims <= src.size()), true,
+                        src.size() - 1,
+                        num_row_dims));
+  PADDLE_ENFORCE_EQ((num_col_dims >= 2 && num_col_dims <= src.size()),
+                    true,
                     platform::errors::InvalidArgument(
                         "The num_col_dims should be inside [2, %d] "
                         "in flatten_to_3d, but received %d.",
-                        src.size(), num_col_dims));
+                        src.size(),
+                        num_col_dims));
   PADDLE_ENFORCE_GE(
-      num_col_dims, num_row_dims,
+      num_col_dims,
+      num_row_dims,
       platform::errors::InvalidArgument(
           "The num_row_dims should be less than num_col_dims in flatten_to_3d,"
           "but received num_row_dims = %d, num_col_dims = %d.",
-          num_row_dims, num_col_dims));
+          num_row_dims,
+          num_col_dims));
 
   return DDim({product(slice_ddim(src, 0, num_row_dims)),
                product(slice_ddim(src, num_row_dims, num_col_dims)),
@@ -169,13 +179,16 @@ DDim DDim::reshape(const std::vector<int>& shape) const {
   out_dims.rank_ = shape.size();
   for (size_t i = 0; i < shape.size(); ++i) {
     if (shape[i] == copy_dim_val) {
-      PADDLE_ENFORCE_LT(static_cast<int>(i), in_dims.size(),
+      PADDLE_ENFORCE_LT(static_cast<int>(i),
+                        in_dims.size(),
                         platform::errors::InvalidArgument(
                             "Index %d of shape under which the value of 0 "
                             "is stored, must be lower than the number of "
                             "old dimensions. But received shape[%d] = 0, "
                             "dimensions = %d, shape = [%s].",
-                            i, in_dims.size(), in_dims));
+                            i,
+                            in_dims.size(),
+                            in_dims));
       out_dims[i] = in_dims[i];
     } else {
       out_dims[i] = shape[i];
@@ -190,19 +203,23 @@ DDim DDim::transpose(const std::vector<int>& axis) const {
   size_t axis_size = axis.size();
 
   auto axis_set = std::set<int>(axis.begin(), axis.end());
-  PADDLE_ENFORCE_EQ(axis_set.size(), axis_size,
+  PADDLE_ENFORCE_EQ(axis_set.size(),
+                    axis_size,
                     platform::errors::InvalidArgument(
                         "In an axis array, elements must be unique."));
 
   PADDLE_ENFORCE_EQ(
-      in_rank, axis_size,
+      in_rank,
+      axis_size,
       platform::errors::InvalidArgument("The input dimension's size "
                                         "should be equal to the axis's size. "
                                         "But received dimension is %d, "
                                         "axis's size is %d",
-                                        in_rank, axis_size));
+                                        in_rank,
+                                        axis_size));
 
-  PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size,
+  PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()),
+                    axis_size,
                     platform::errors::InvalidArgument(
                         "Axis values must be ranging from 0 to (dims - 1)."));
 
@@ -214,4 +231,4 @@ DDim DDim::transpose(const std::vector<int>& axis) const {
 }
 
 }  // namespace framework
-}  // namespace paddle
+}  // namespace pten
\ No newline at end of file
diff --git a/paddle/pten/core/ddim.h b/paddle/pten/core/ddim.h
new file mode 100644
index 0000000000000..148c32481c008
--- /dev/null
+++ b/paddle/pten/core/ddim.h
@@ -0,0 +1,257 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <initializer_list>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "paddle/pten/core/dim.h"
+
+namespace pten {
+namespace platform = paddle::platform;
+namespace framework {
+
+#define PADDLE_VISIT_DDIM_BASE(rank, callback) \
+  case (rank): {                               \
+    constexpr auto kRank = (rank);             \
+    return (callback);                         \
+  }
+
+#define PADDLE_VISIT_DDIM(rank, callback)                                  \
+  switch (rank) {                                                          \
+    PADDLE_VISIT_DDIM_BASE(0, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(1, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(2, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(3, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(4, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(5, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(6, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(7, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(8, callback);                                   \
+    PADDLE_VISIT_DDIM_BASE(9, callback);                                   \
+    default:                                                               \
+      PADDLE_THROW(platform::errors::Unimplemented(                        \
+          "Invalid dimension to be accessed. Now only supports access to " \
+          "dimension 0 to 9, but received dimension is %d.",               \
+          rank));                                                          \
+  }
+
+template <typename T1, typename T2>
+inline void dynamic_dim_assign(const T1* in, T2* out, int n) {
+  PADDLE_VISIT_DDIM(n, (static_dim_assign<kRank, T1, T2>(in, out)));
+}
+
+/**
+ * \brief A dynamically sized dimension.
+ *
+ * The number of dimensions must be between [1, 9].
+ */
+class DDim {
+ public:
+  constexpr static int kMaxRank = 9;
+
+  DDim() : rank_(1) { dim_[0] = 0; }
+
+  DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); }
+
+  DDim(const int* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
+
+  DDim(const int64_t* d, int n) : rank_(n) {
+    dynamic_dim_assign(d, dim_.GetMutable(), n);
+  }
+
+  template <int D>
+  /*implicit*/ DDim(const Dim<D>& in) : rank_(D) {  // NOLINT
+    UnsafeCast<D>() = in;
+  }
+
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list)
+      : DDim(init_list.begin(), init_list.size()) {}
+
+  inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); }
+
+  template <int D>
+  inline DDim& operator=(const Dim<D>& dim) {
+    rank_ = D;
+    UnsafeCast<D>() = dim;
+    return *this;
+  }
+
+  inline int64_t& operator[](int idx) { return dim_[idx]; }
+
+  inline int64_t operator[](int idx) const { return dim_[idx]; }
+
+  int64_t& at(int idx) {
+    PADDLE_ENFORCE_GE(idx,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "Invalid DDim index to be accessed. The valid index "
+                          "is between 0 and %d, but received index is %d.",
+                          rank_,
+                          idx));
+    PADDLE_ENFORCE_LT(idx,
+                      rank_,
+                      platform::errors::InvalidArgument(
+                          "Invalid DDim index to be accessed. The valid index "
+                          "is between 0 and %d, but received index is %d.",
+                          rank_,
+                          idx));
+    return dim_[idx];
+  }
+
+  int64_t at(int idx) const {
+    PADDLE_ENFORCE_GE(idx,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "Invalid DDim index to be accessed. The valid index "
+                          "is between 0 and %d, but received index is %d.",
+                          rank_,
+                          idx));
+    PADDLE_ENFORCE_LT(idx,
+                      rank_,
+                      platform::errors::InvalidArgument(
+                          "Invalid DDim index to be accessed. The valid index "
+                          "is between 0 and %d, but received index is %d.",
+                          rank_,
+                          idx));
+    return dim_[idx];
+  }
+
+  template <typename Visitor>
+  typename std::result_of<Visitor(Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
+  }
+
+  template <typename Visitor>
+  typename std::result_of<Visitor(const Dim<0>&)>::type apply_visitor(
+      Visitor&& visitor) const {
+    PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast<kRank>()));
+  }
+
+  bool operator==(const DDim& d) const;
+
+  bool operator!=(const DDim& d) const;
+
+  inline const int64_t* Get() const { return dim_.Get(); }
+
+  inline int64_t* GetMutable() { return dim_.GetMutable(); }
+
+  inline int size() const { return rank_; }
+
+  std::string to_str() const;
+
+  DDim reshape(const std::vector<int>& shape) const;
+
+  DDim transpose(const std::vector<int>& axis) const;
+
+ private:
+  template <int D>
+  inline Dim<D>& UnsafeCast() {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<void*>(&dim_);
+    return *reinterpret_cast<Dim<D>*>(p);
+  }
+
+  template <int D>
+  inline const Dim<D>& UnsafeCast() const {
+    static_assert(D >= 0 && D <= kMaxRank, "Invalid rank");
+    auto* p = static_cast<const void*>(&dim_);
+    return *reinterpret_cast<const Dim<D>*>(p);
+  }
+
+  inline DDim& CopyFrom(const DDim& ddim) {
+    PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast<kRank>()));
+  }
+
+  friend DDim stride(const DDim& ddim);
+  friend DDim stride_numel(const DDim& ddim);
+
+ private:
+  Dim<kMaxRank> dim_;
+  int rank_;
+};
+
+#undef PADDLE_VISIT_DDIM_BASE
+#undef PADDLE_VISIT_DDIM
+
+/**
+ * \brief Make a DDim from std::vector<int64_t>
+ *
+ * \param dims A vector of ints. Must be sized between [1, 9]
+ */
+DDim make_ddim(const std::vector<int64_t>& dims);
+
+DDim make_ddim(const std::vector<int>& dims);
+
+/**
+ * \brief Make a DDim from an initializer list
+ *
+ * \param dims An initializer list of ints. Must be sized between [1, 9]
+ *
+ */
+DDim make_ddim(std::initializer_list<int64_t> dims);
+
+template <typename T = int64_t>
+std::vector<T> vectorize(const DDim& ddim) {
+  std::vector<T> result(DDim::kMaxRank);
+  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
+  result.resize(ddim.size());
+  return result;
+}
+
+int64_t product(const DDim& ddim);
+
+bool contain_unknown_dim(const DDim& ddim);
+
+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.  DDim d = make_ddim({1,2,3,4,5});
+ *       slice_ddim(d, 1, 3); ====> {2,3}
+ */
+DDim slice_ddim(const DDim& dim, int begin, int end);
+
+/**
+ * \brief What is the length of this dimension?
+ *
+ * \param ddim Dynamic dimension to inspect
+ */
+
+int arity(const DDim& ddim);
+
+std::ostream& operator<<(std::ostream&, const DDim&);
+
+/**
+* \brief Flatten dim to 3d
+*       e.g., DDim d = make_ddim({1, 2, 3, 4, 5, 6})
+*       flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30}
+*/
+DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims);
+
+// Reshape a tensor to a matrix. The matrix's first dimension(column length)
+// will be the product of tensor's first `num_col_dims` dimensions.
+DDim flatten_to_2d(const DDim& src, int num_col_dims);
+
+DDim flatten_to_1d(const DDim& src);
+
+DDim stride(const DDim& ddim);
+
+DDim stride_numel(const DDim& ddim);
+}  // namespace framework
+}  // namespace pten
diff --git a/paddle/pten/core/ddim_test.cc b/paddle/pten/core/ddim_test.cc
new file mode 100644
index 0000000000000..1903bbfdff135
--- /dev/null
+++ b/paddle/pten/core/ddim_test.cc
@@ -0,0 +1,83 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+#include "paddle/pten/core/ddim.h"
+
+TEST(DDim, Equality) {
+  // construct a DDim from an initialization list
+  pten::framework::DDim ddim = pten::framework::make_ddim({9, 1, 5});
+  EXPECT_EQ(ddim[0], 9);
+  EXPECT_EQ(ddim[1], 1);
+  EXPECT_EQ(ddim[2], 5);
+
+  // construct a DDim from a vector (check the vector-built instance itself)
+  std::vector<int64_t> vec({9, 1, 5});
+  pten::framework::DDim vddim = pten::framework::make_ddim(vec);
+  EXPECT_EQ(vddim[0], 9);
+  EXPECT_EQ(vddim[1], 1);
+  EXPECT_EQ(vddim[2], 5);
+
+  // mutate a DDim
+  ddim[1] = 2;
+  EXPECT_EQ(ddim[1], 2);
+  ddim[0] = 6;
+  EXPECT_EQ(ddim[0], 6);
+
+  // vectorize a DDim
+  std::vector<int64_t> res_vec = pten::framework::vectorize(vddim);
+  EXPECT_EQ(res_vec[0], 9);
+  EXPECT_EQ(res_vec[1], 1);
+  EXPECT_EQ(res_vec[2], 5);
+  pten::framework::Dim<3> d(3, 2, 1);
+  res_vec = pten::framework::vectorize(pten::framework::DDim(d));
+  EXPECT_EQ(res_vec[0], 3);
+  EXPECT_EQ(res_vec[1], 2);
+  EXPECT_EQ(res_vec[2], 1);
+
+  // arity of a DDim
+  EXPECT_EQ(pten::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);
+
+  // product of a DDim
+  EXPECT_EQ(pten::framework::product(vddim), 45);
+  EXPECT_EQ(pten::framework::product(pten::framework::make_ddim({3, 2, 5, 3})),
+            90);
+
+  // slice a DDim
+  pten::framework::DDim ddim2 = pten::framework::make_ddim({1, 2, 3, 4, 5, 6});
+  pten::framework::DDim ss = pten::framework::slice_ddim(ddim2, 2, 5);
+  EXPECT_EQ(arity(ss), 3);
+  EXPECT_EQ(ss[0], 3);
+  EXPECT_EQ(ss[1], 4);
+  EXPECT_EQ(ss[2], 5);
+  pten::framework::DDim ss2 = pten::framework::slice_ddim(ddim2, 0, 6);
+  EXPECT_EQ(arity(ss2), 6);
+  EXPECT_EQ(ss2[0], 1);
+  EXPECT_EQ(ss2[1], 2);
+  EXPECT_EQ(ss2[2], 3);
+  EXPECT_EQ(ss2[3], 4);
+  EXPECT_EQ(ss2[4], 5);
+  EXPECT_EQ(ss2[5], 6);
+}
+
+TEST(DDim, Print) {
+  // print a DDim
+  std::stringstream ss;
+  pten::framework::DDim ddim = pten::framework::make_ddim({2, 3, 4});
+  ss << ddim;
+  EXPECT_EQ("2, 3, 4", ss.str());
+}
diff --git a/paddle/pten/core/dim.h b/paddle/pten/core/dim.h
new file mode 100644
index 0000000000000..8dd984891a894
--- /dev/null
+++ b/paddle/pten/core/dim.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+
+#include "paddle/pten/core/array.h"
+#include "paddle/pten/core/hostdevice.h"
+
+namespace pten {
+namespace framework {
+
+// Statically sized, statically indexed dimension
+template <int D>
+class Dim : public Array<int64_t, D> {
+ public:
+  static_assert(D >= 0, "D must be not less than 0");
+
+  static constexpr int kRank = D;
+  using BaseClass = Array<int64_t, D>;
+
+  inline Dim(int64_t head, const Dim<D - 1>& tail) {
+    (*this)[0] = head;
+    new (this->GetMutable() + 1) Dim<D - 1>(tail);
+  }
+
+  template <typename... Args>
+  HOSTDEVICE explicit Dim(int64_t head, Args... args)
+      : BaseClass(head, args...) {}
+
+  /** Construct a Dim with each dimension set to the given index */
+  HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); }
+
+  HOSTDEVICE Dim() = default;
+
+  HOST std::string to_string() const;
+};
+
+// Product of a Dim
+template <int D>
+HOSTDEVICE inline int64_t product(const Dim<D>& a) {
+  return UnrollProduct<D>::Run(a.Get());
+}
+
+/**
+ * Helper function to create a Dim
+ *
+ * \param idxes The type of Dim constructed depends on the number of params
+ *
+ */
+
+template <typename... Args>
+HOSTDEVICE inline Dim<sizeof...(Args)> make_dim(Args... idxes) {
+  return Dim<sizeof...(Args)>(idxes...);
+}
+
+// Allows us to output a Dim
+template <int D>
+inline std::ostream& operator<<(std::ostream& os, const Dim<D>& d) {
+  os << d[0];
+  for (int i = 1; i < D; ++i) {
+    os << ", " << d[i];
+  }
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) {
+  return os;
+}
+
+template <int D>
+HOST std::string Dim<D>::to_string() const {
+  std::stringstream stream;
+  stream << *this;
+  return stream.str();
+}
+
+template <int D, typename T1, typename T2>
+inline void static_dim_assign(const T1* in, T2* out) {
+  UnrollAssign<D>::Run(in, out);
+}
+
+}  // namespace framework
+}  // namespace pten
diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/pten/core/dim_test.cu
similarity index 62%
rename from paddle/fluid/framework/dim_test.cu
rename to paddle/pten/core/dim_test.cu
index b3c26b10c6ffb..0f8d71c5d3b4c 100644
--- a/paddle/fluid/framework/dim_test.cu
+++ b/paddle/pten/core/dim_test.cu
@@ -1,42 +1,43 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #include <thrust/device_vector.h>
 #include <sstream>
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/dim.h"
+#include "paddle/pten/core/dim.h"
 
-__global__ void test(paddle::framework::Dim<2>* o) {
-  o[0] = paddle::framework::make_dim(5, 6);
+__global__ void test(pten::framework::Dim<2>* o) {
+  o[0] = pten::framework::make_dim(5, 6);
 }
 
 __global__ void dyn_idx_gpu(int64_t* o) {
-  auto d = paddle::framework::make_dim(5, 6);
+  auto d = pten::framework::make_dim(5, 6);
   o[0] = d[1];
 }
 
 TEST(Dim, Equality) {
   // construct a Dim on the CPU
-  auto a = paddle::framework::make_dim(3, 4);
+  auto a = pten::framework::make_dim(3, 4);
   EXPECT_EQ(a[0], 3);
   EXPECT_EQ(a[1], 4);
 
   // construct a Dim on the GPU
-  thrust::device_vector<paddle::framework::Dim<2>> t(2);
+  thrust::device_vector<pten::framework::Dim<2>> t(2);
 #ifdef PADDLE_WITH_HIP
-  hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0,
-                     thrust::raw_pointer_cast(t.data()));
+  hipLaunchKernelGGL(
+      test, dim3(1), dim3(1), 0, 0, thrust::raw_pointer_cast(t.data()));
 #else
   test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
 #endif
@@ -45,10 +46,10 @@ TEST(Dim, Equality) {
   EXPECT_EQ(a[1], 6);
 
   // product
-  EXPECT_EQ(paddle::framework::product(a), 30);
+  EXPECT_EQ(pten::framework::product(a), 30);
 
   // mutate a Dim
-  auto b = paddle::framework::make_dim(7, 8);
+  auto b = pten::framework::make_dim(7, 8);
   b[1] = 10;
   EXPECT_EQ(b[0], 7);
   EXPECT_EQ(b[1], 10);
@@ -61,8 +62,8 @@ TEST(Dim, Equality) {
   // dynamic access on GPU
   thrust::device_vector<int64_t> r(1);
 #ifdef PADDLE_WITH_HIP
-  hipLaunchKernelGGL(dyn_idx_gpu, dim3(1), dim3(1), 0, 0,
-                     thrust::raw_pointer_cast(r.data()));
+  hipLaunchKernelGGL(
+      dyn_idx_gpu, dim3(1), dim3(1), 0, 0, thrust::raw_pointer_cast(r.data()));
 #else
   dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
 #endif
@@ -71,9 +72,9 @@ TEST(Dim, Equality) {
 }
 
 TEST(Dim, Bool) {
-  auto a = paddle::framework::make_dim(3, 4);
-  auto b = paddle::framework::make_dim(5, 6);
-  auto c = paddle::framework::make_dim(3, 4);
+  auto a = pten::framework::make_dim(3, 4);
+  auto b = pten::framework::make_dim(5, 6);
+  auto c = pten::framework::make_dim(3, 4);
 
   // comparison
   EXPECT_TRUE(a == a);
@@ -84,13 +85,13 @@ TEST(Dim, Bool) {
 TEST(Dim, Print) {
   {
     std::stringstream ss;
-    auto a = paddle::framework::make_dim(2, 3);
+    auto a = pten::framework::make_dim(2, 3);
     ss << a;
     EXPECT_EQ(ss.str(), "2, 3");
   }
   {
     std::stringstream ss;
-    ss << paddle::framework::make_dim(8);
+    ss << pten::framework::make_dim(8);
     EXPECT_EQ(ss.str(), "8");
   }
-}
+}
\ No newline at end of file
diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/pten/core/hostdevice.h
similarity index 89%
rename from paddle/fluid/platform/hostdevice.h
rename to paddle/pten/core/hostdevice.h
index 65005a5adbb1d..08fe3125287d7 100644
--- a/paddle/fluid/platform/hostdevice.h
+++ b/paddle/pten/core/hostdevice.h
@@ -1,16 +1,17 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
-//    http://www.apache.org/licenses/LICENSE-2.0
+//     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
+
 #pragma once
 
 #ifdef __HIPCC__
diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h
index 528a52cee8da4..662553cbcb598 100644
--- a/paddle/pten/core/tensor_base.h
+++ b/paddle/pten/core/tensor_base.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/pten/common/backend.h"
 #include "paddle/pten/common/data_type.h"
 #include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/ddim.h"
 #include "paddle/pten/core/storage.h"
 #include "paddle/pten/core/utils/type_registry.h"
 
@@ -28,7 +28,7 @@ class TensorBase {
  public:
   using DataType = paddle::experimental::DataType;
   using DataLayout = paddle::experimental::DataLayout;
-  using DDim = paddle::framework::DDim;
+  using DDim = pten::framework::DDim;
   using Place = paddle::platform::Place;
 
   virtual ~TensorBase() = default;
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index 2df6b48b674a7..ac3f17267c4f9 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/pten/common/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/ddim.h"
+#include "paddle/pten/core/ddim.h"
 
 // Note: mixed_vector include many header now, LoD will be
 // used on CUDA device? Can we use small_vector here?
@@ -30,7 +30,7 @@ limitations under the License. */
 
 namespace pten {
 
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 using LoD = std::vector<paddle::framework::Vector<size_t>>;
 /// \brief The meta data of dense tensor. Take the structure type
 /// and use all default operations.
diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/pten/core/unroll_array_ops.h
similarity index 96%
rename from paddle/fluid/framework/unroll_array_ops.h
rename to paddle/pten/core/unroll_array_ops.h
index a9c047cc6c6ac..fb0358375a58e 100644
--- a/paddle/fluid/framework/unroll_array_ops.h
+++ b/paddle/pten/core/unroll_array_ops.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,9 +16,9 @@
 #include <cstddef>
 #include <type_traits>
 
-#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/pten/core/hostdevice.h"
 
-namespace paddle {
+namespace pten {
 namespace framework {
 
 namespace detail {
@@ -130,4 +130,4 @@ template <size_t N>
 using UnrollProduct = detail::UnrollProduct<0, N, N == 0>;
 
 }  // namespace framework
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/pten/core/unroll_array_ops_test.cc
similarity index 92%
rename from paddle/fluid/framework/unroll_array_ops_test.cc
rename to paddle/pten/core/unroll_array_ops_test.cc
index c4fdfdb425f23..f32d94be759be 100644
--- a/paddle/fluid/framework/unroll_array_ops_test.cc
+++ b/paddle/pten/core/unroll_array_ops_test.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/unroll_array_ops.h"
+#include "paddle/pten/core/unroll_array_ops.h"
 
 #include <gtest/gtest.h>
 #include <array>
 
-namespace paddle {
+namespace pten {
 namespace framework {
 
 template <typename T>
@@ -79,4 +79,4 @@ TEST(unroll_ops, product) {
 }
 
 }  // namespace framework
-}  // namespace paddle
+}  // namespace pten
\ No newline at end of file
diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc
index ea587806bfcb2..083fb0fca2188 100644
--- a/paddle/pten/infermeta/binary.cc
+++ b/paddle/pten/infermeta/binary.cc
@@ -64,8 +64,8 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta,
                                 const DenseTensorMeta& y_meta,
                                 bool trans_x,
                                 bool trans_y) {
-  std::vector<int64_t> dims_x = paddle::framework::vectorize(x_meta.dims);
-  std::vector<int64_t> dims_y = paddle::framework::vectorize(y_meta.dims);
+  std::vector<int64_t> dims_x = pten::framework::vectorize(x_meta.dims);
+  std::vector<int64_t> dims_y = pten::framework::vectorize(y_meta.dims);
   auto ndims_x = dims_x.size();
   auto ndims_y = dims_y.size();
   PADDLE_ENFORCE_GT(ndims_x,
@@ -125,7 +125,7 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta,
     new_dims.push_back(1);
   }
 
-  auto ddim_out = paddle::framework::make_ddim(new_dims);
+  auto ddim_out = pten::framework::make_ddim(new_dims);
 
   return {x_meta.dtype, ddim_out, x_meta.layout};
 }
@@ -169,7 +169,7 @@ DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta,
                                   out_dims_array.data(),
                                   max_dim,
                                   axis);
-    return_meta.dims = paddle::framework::make_ddim(out_dims_array);
+    return_meta.dims = pten::framework::make_ddim(out_dims_array);
   }
   return_meta.lod = x_meta.lod;
   return return_meta;
diff --git a/paddle/pten/infermeta/nullary.cc b/paddle/pten/infermeta/nullary.cc
index 731e69e60907b..19e11f049fee7 100644
--- a/paddle/pten/infermeta/nullary.cc
+++ b/paddle/pten/infermeta/nullary.cc
@@ -20,14 +20,14 @@ namespace pten {
 DenseTensorMeta CreateInferMeta(const std::vector<int64_t>& shape,
                                 DataType dtype,
                                 DataLayout layout) {
-  const auto& out_dims = paddle::framework::make_ddim(shape);
+  const auto& out_dims = pten::framework::make_ddim(shape);
   return {dtype, out_dims, layout};
 }
 
 DenseTensorMeta CreateInferMeta(const ScalarArray& shape,
                                 DataType dtype,
                                 DataLayout layout) {
-  const auto& out_dims = paddle::framework::make_ddim(shape.GetData());
+  const auto& out_dims = pten::framework::make_ddim(shape.GetData());
   return {dtype, out_dims, layout};
 }
 
diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc
index 843a78f3413cf..27e1dc9511df2 100644
--- a/paddle/pten/infermeta/unary.cc
+++ b/paddle/pten/infermeta/unary.cc
@@ -23,7 +23,7 @@ DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta) {
 }
 
 DenseTensorMeta ReductionInferMeta(const DenseTensorMeta& x_meta) {
-  const auto& out_dims = paddle::framework::make_ddim({1});
+  const auto& out_dims = pten::framework::make_ddim({1});
   DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout);
   return return_meta;
 }
@@ -63,7 +63,7 @@ DenseTensorMeta FlattenInferMeta(const DenseTensorMeta& x_meta,
   for (int i = stop_axis + 1; i < in_dims_size; i++) {
     out_shape.push_back(x_dims[i]);
   }
-  const auto& out_dims = paddle::framework::make_ddim(out_shape);
+  const auto& out_dims = pten::framework::make_ddim(out_shape);
   DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout);
 
   if (x_dims[0] == return_meta.dims[0]) {
@@ -89,10 +89,10 @@ DenseTensorMeta CreateLikeInferMeta(const DenseTensorMeta& x_meta,
           layout == DataLayout::UNDEFINED ? x_meta.layout : layout};
 }
 
-static paddle::framework::DDim ValidateShape(
-    const std::vector<int64_t> shape, const paddle::framework::DDim& in_dims) {
-  const int64_t in_size = paddle::framework::product(in_dims);
-  auto in_dims_vec = paddle::framework::vectorize(in_dims);
+static pten::framework::DDim ValidateShape(
+    const std::vector<int64_t> shape, const pten::framework::DDim& in_dims) {
+  const int64_t in_size = pten::framework::product(in_dims);
+  auto in_dims_vec = pten::framework::vectorize(in_dims);
   bool all_positive = std::all_of(in_dims_vec.cbegin(),
                                   in_dims_vec.cend(),
                                   [](int64_t i) { return i > 0; });
@@ -112,7 +112,7 @@ static paddle::framework::DDim ValidateShape(
           paddle::platform::errors::InvalidArgument(
               "Only one dimension value of 'shape' in ReshapeOp can "
               "be -1. But received shape = [%s], shape[%d] is also -1.",
-              paddle::framework::make_ddim(shape),
+              pten::framework::make_ddim(shape),
               i));
       unk_dim_idx = i;
     } else if (shape[i] == copy_dim_val) {
@@ -124,7 +124,7 @@ static paddle::framework::DDim ValidateShape(
               "the input tensor X's dimensions. "
               "But received shape = [%s], shape[%d] = 0, X's shape = [%s], "
               "X's dimensions = %d.",
-              paddle::framework::make_ddim(shape),
+              pten::framework::make_ddim(shape),
               i,
               in_dims,
               in_dims.size()));
@@ -136,7 +136,7 @@ static paddle::framework::DDim ValidateShape(
               "Each dimension value of 'shape' in ReshapeOp must not "
               "be negative except one unknown dimension. "
               "But received  shape = [%s], shape[%d] = %d.",
-              paddle::framework::make_ddim(shape),
+              pten::framework::make_ddim(shape),
               i,
               shape[i]));
     }
@@ -165,7 +165,7 @@ static paddle::framework::DDim ValidateShape(
               "'shape' is [%s], known capacity of 'shape' is %d.",
               in_dims,
               in_size,
-              paddle::framework::make_ddim(shape),
+              pten::framework::make_ddim(shape),
               capacity));
     } else {
       output_shape[unk_dim_idx] = -1;
@@ -183,7 +183,7 @@ static paddle::framework::DDim ValidateShape(
               "[%s], the capacity of 'shape' is %d.",
               in_dims,
               in_size,
-              paddle::framework::make_ddim(shape),
+              pten::framework::make_ddim(shape),
               capacity));
     }
   }
@@ -202,11 +202,11 @@ static paddle::framework::DDim ValidateShape(
             "capacity of 'Out' is %d.",
             in_dims,
             in_size,
-            paddle::framework::make_ddim(shape),
+            pten::framework::make_ddim(shape),
             capacity));
   }
 
-  return paddle::framework::make_ddim(output_shape);
+  return pten::framework::make_ddim(output_shape);
 }
 
 DenseTensorMeta InferMetaFromVecValue(const DenseTensorMeta& x_meta,
@@ -267,7 +267,7 @@ DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta,
       out_dim_vector.push_back(1);
     }
   }
-  DDim out_dim = paddle::framework::make_ddim(out_dim_vector);
+  DDim out_dim = pten::framework::make_ddim(out_dim_vector);
 
   DataType out_dtype;
   if (dtype != DataType::UNDEFINED) {
diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h
index f048678111cf2..e4f426d3f8eb4 100644
--- a/paddle/pten/kernels/cpu/elementwise.h
+++ b/paddle/pten/kernels/cpu/elementwise.h
@@ -583,8 +583,8 @@ void CommonElementwiseBroadcastBackward(const CPUContext& ctx,
   }
 
   VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
-          << paddle::framework::make_ddim(x_dims_array)
-          << " ydim:" << paddle::framework::make_ddim(y_dims_array);
+          << pten::framework::make_ddim(x_dims_array)
+          << " ydim:" << pten::framework::make_ddim(y_dims_array);
 
   CommonGradBroadcastCPU<T, DX_OP, DY_OP, Tout>(x,
                                                 y,
diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h
index b38f17aa02a55..86443c254bf67 100644
--- a/paddle/pten/kernels/cpu/reduce.h
+++ b/paddle/pten/kernels/cpu/reduce.h
@@ -50,13 +50,13 @@ void ReduceFunctor(const DeviceContext& context,
   DDim out_dims = output->dims();
   if (keep_dim && x_rank > 1) {
     const int kDelFlag = -2;
-    auto dims_vector = paddle::framework::vectorize(out_dims);
+    auto dims_vector = pten::framework::vectorize(out_dims);
     for (size_t i = 0; i < dims_ref.size(); ++i) {
       dims_vector[dims_ref[i]] = kDelFlag;
     }
     dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag),
                       dims_vector.end());
-    out_dims = paddle::framework::make_ddim(dims_vector);
+    out_dims = pten::framework::make_ddim(dims_vector);
   }
   auto& place = *context.eigen_device();
   Functor functor;
diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc
index d6a155dca0176..2deac0146c52c 100644
--- a/paddle/pten/kernels/empty_kernel.cc
+++ b/paddle/pten/kernels/empty_kernel.cc
@@ -24,7 +24,7 @@ template <typename T, typename Context>
 void EmptyKernel(const Context& dev_ctx,
                  const ScalarArray& shape,
                  DenseTensor* out) {
-  out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData()));
+  out->ResizeAndAllocate(pten::framework::make_ddim(shape.GetData()));
 }
 
 template <typename T, typename Context>
diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc
index e45ac516e16ed..cbbf62f1993e2 100644
--- a/paddle/pten/kernels/flatten_grad_kernel.cc
+++ b/paddle/pten/kernels/flatten_grad_kernel.cc
@@ -25,8 +25,7 @@ void FlattenGradKernel(const Context& dev_ctx,
                        const DenseTensor& xshape,
                        DenseTensor* x_grad) {
   auto xshape_dims = xshape.dims();
-  auto x_dims =
-      paddle::framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
+  auto x_dims = pten::framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
   pten::Copy(dev_ctx, out_grad, false, x_grad);
   x_grad->ResizeAndAllocate(x_dims);
 }
diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h
index 8693fd2b36c4e..6bb45ad199510 100644
--- a/paddle/pten/kernels/funcs/common_shape.h
+++ b/paddle/pten/kernels/funcs/common_shape.h
@@ -26,7 +26,7 @@ inline void SetXShape(const DenseTensor &x, DenseTensor *xshape) {
   for (int i = 0; i < in_dims.size(); ++i) {
     xshape_dims[i + 1] = in_dims[i];
   }
-  xshape->ResizeAndAllocate(paddle::framework::make_ddim(xshape_dims));
+  xshape->ResizeAndAllocate(pten::framework::make_ddim(xshape_dims));
   xshape->ResetLoD(x.meta().lod);
 }
 
diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h
index 7396c64de9eab..47924c4e2ae18 100644
--- a/paddle/pten/kernels/funcs/elementwise_base.h
+++ b/paddle/pten/kernels/funcs/elementwise_base.h
@@ -36,10 +36,10 @@ enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 };
    for supporting multiple-output feature in elementwise system.*/
 template <class T, int Num>
 using ConditionalT =
-    typename std::conditional_t<Num == 1, T, paddle::framework::Array<T, Num>>;
+    typename std::conditional_t<Num == 1, T, pten::framework::Array<T, Num>>;
 
 namespace funcs {
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 template <typename T, typename DX_OP, typename DY_OP, typename Tout = T>
 struct ElemwiseGradNoBroadcast {
@@ -303,9 +303,9 @@ inline DDim trim_trailing_singular_dims(const DDim &dims) {
     trim_dims[i] = dims[i];
   }
   if (trim_dims.size() == 0) {
-    return DDim(paddle::framework::make_dim());
+    return DDim(pten::framework::make_dim());
   }
-  DDim actual_dims = paddle::framework::make_ddim(trim_dims);
+  DDim actual_dims = pten::framework::make_ddim(trim_dims);
   return actual_dims;
 }
 
@@ -377,7 +377,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx,
                                     DenseTensor *dy,
                                     DX_OP dx_op,
                                     DY_OP dy_op) {
-  size_t N = static_cast<size_t>(paddle::framework::product(x_dim));
+  size_t N = static_cast<size_t>(pten::framework::product(x_dim));
   paddle::platform::ForRange<DeviceContext> for_range(dev_ctx, N);
   for_range(ElemwiseGradNoBroadcast<T, DX_OP, DY_OP, Tout>{
       x.data<T>(),
@@ -462,7 +462,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
 template <typename OutT, int VecSize, bool IsBoundary, int NumOuts>
 struct ElementwiseWriteDataCaller {
   __device__ __forceinline__ void operator()(
-      paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
+      pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
       ConditionalT<OutT, NumOuts> src[VecSize],
       int block_offset,
       int num) {
@@ -485,7 +485,7 @@ struct ElementwiseWriteDataCaller {
 template <typename OutT, int VecSize, bool IsBoundary>
 struct ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, 1> {
   __device__ __forceinline__ void operator()(
-      paddle::framework::Array<_ptr_ OutT *, 1> outs,
+      pten::framework::Array<_ptr_ OutT *, 1> outs,
       OutT src[VecSize],
       int block_offset,
       int num) {
@@ -502,8 +502,8 @@ template <typename InT,
           int VecSize,
           bool IsBoundary>
 __device__ void VectorizedElementwiseKernelImpl(
-    const paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> &in,
-    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
+    const pten::framework::Array<const _ptr_ InT *__restrict__, Arity> &in,
+    pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int num,
     int data_offset,
     Functor func) {
@@ -537,8 +537,8 @@ template <typename InT,
           int NumOuts,
           int VecSize>
 __global__ void VectorizedElementwiseKernel(
-    paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
-    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
+    pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
+    pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
     int size,
     int main_offset,
     Functor func) {
@@ -578,8 +578,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
                            std::vector<DenseTensor *> *outs,
                            Functor func) {
   auto numel = ins[0]->numel();
-  paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
-  paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
+  pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
+  pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
 
   for (int i = 0; i < Arity; ++i) {
     ins_data[i] = ins[i]->data<InT>();
diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h
index 6b89902456ac8..6d139d68530be 100644
--- a/paddle/pten/kernels/funcs/elementwise_functor.h
+++ b/paddle/pten/kernels/funcs/elementwise_functor.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/pten/common/float16.h"
+#include "paddle/pten/core/hostdevice.h"
 
 namespace pten {
 namespace funcs {
diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc
index 77d26fcbc3536..90a6859a85091 100644
--- a/paddle/pten/kernels/funcs/transpose.cc
+++ b/paddle/pten/kernels/funcs/transpose.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/pten/kernels/funcs/transpose.h"
-#include "paddle/fluid/framework/ddim.h"
 #include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/ddim.h"
 #include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -33,8 +33,8 @@ struct TransposeNormal<CPUContext, T> {
                   pten::DenseTensor* out,
                   const std::vector<int64_t>& axis) {
     const int rank = axis.size();
-    auto in_stride = paddle::framework::stride(in.dims());
-    auto out_stride = paddle::framework::stride(out->dims());
+    auto in_stride = pten::framework::stride(in.dims());
+    auto out_stride = pten::framework::stride(out->dims());
     const T* in_ptr = in.data<T>();
     T* out_ptr = out->mutable_data<T>();
 
diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu
index 045bfdbdb051c..474a7c4ea4de9 100644
--- a/paddle/pten/kernels/funcs/transpose.cu
+++ b/paddle/pten/kernels/funcs/transpose.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/ddim.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/kernels/funcs/transpose.h"
 
@@ -58,8 +58,8 @@ struct TransposeNormal<GPUContext, T> {
                   pten::DenseTensor* out,
                   const std::vector<int64_t>& axis) {
     const int rank = axis.size();
-    auto in_stride = paddle::framework::stride(in.dims());
-    auto out_stride = paddle::framework::stride(out->dims());
+    auto in_stride = pten::framework::stride(in.dims());
+    auto out_stride = pten::framework::stride(out->dims());
     auto* in_ptr = in.data<T>();
     auto* out_ptr = out->mutable_data<T>();
 
diff --git a/paddle/pten/kernels/funcs/transpose.h b/paddle/pten/kernels/funcs/transpose.h
index d0e4dafe2c3b8..0cb2b4289fe6e 100644
--- a/paddle/pten/kernels/funcs/transpose.h
+++ b/paddle/pten/kernels/funcs/transpose.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/fluid/framework/ddim.h"
+#include "paddle/pten/core/ddim.h"
 #include "paddle/pten/core/dense_tensor.h"
 
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h
index c3ff91e7b15cd..def54e24840e7 100644
--- a/paddle/pten/kernels/gpu/elementwise.h
+++ b/paddle/pten/kernels/gpu/elementwise.h
@@ -130,14 +130,14 @@ struct DimensionsTransform {
 
  public:
   explicit DimensionsTransform(const std::vector<const DenseTensor *> &ins,
-                               const paddle::framework::DDim &dims,
+                               const pten::framework::DDim &dims,
                                int axis) {
     const int N = ins.size();
     dim_size = dims.size();
-    out_dims = paddle::framework::vectorize<int64_t>(dims);
+    out_dims = pten::framework::vectorize<int64_t>(dims);
     in_dims.resize(N);
     for (int j = 0; j < N; ++j) {
-      in_dims[j] = paddle::framework::vectorize<int64_t>(ins[j]->dims());
+      in_dims[j] = pten::framework::vectorize<int64_t>(ins[j]->dims());
     }
     InputDimensionsExtend(N, axis);
 
@@ -214,11 +214,11 @@ template <typename InT,
           int Rank,
           bool IsBoundary = false>
 __device__ void ElementwiseBroadcastKernelImpl(
-    const paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> &ins,
-    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
-    const paddle::framework::Array<int, Arity> &use_broadcast,
+    const pten::framework::Array<const _ptr_ InT *__restrict__, Arity> &ins,
+    pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
+    const pten::framework::Array<int, Arity> &use_broadcast,
     uint32_t numel,
-    const paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity>
+    const pten::framework::Array<kps::details::BroadcastConfig<Rank>, Arity>
         &configs,
     int num,
     int block_offset,
@@ -259,12 +259,11 @@ template <typename InT,
           int VecSize,
           int Rank>
 __global__ void ElementwiseBroadcastKernel(
-    paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
-    paddle::framework::Array<_ptr_ OutT *, NumOuts> outs,
-    paddle::framework::Array<int, Arity> use_broadcast,
+    pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins,
+    pten::framework::Array<_ptr_ OutT *, NumOuts> outs,
+    pten::framework::Array<int, Arity> use_broadcast,
     uint32_t numel,
-    paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity>
-        configs,
+    pten::framework::Array<kps::details::BroadcastConfig<Rank>, Arity> configs,
     int main_offset,
     int tail_tid,
     Functor func) {
@@ -345,10 +344,10 @@ void LaunchKernel(const KPDevice &ctx,
                   Functor func,
                   DimensionsTransform merge_dims) {
   int numel = (*outs)[0]->numel();
-  paddle::framework::Array<kps::details::BroadcastConfig<Rank>, Arity> configs;
-  paddle::framework::Array<int, Arity> use_broadcast;
-  paddle::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
-  paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
+  pten::framework::Array<kps::details::BroadcastConfig<Rank>, Arity> configs;
+  pten::framework::Array<int, Arity> use_broadcast;
+  pten::framework::Array<const _ptr_ InT *__restrict__, Arity> ins_data;
+  pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data;
 
   for (int i = 0; i < NumOuts; ++i) {
     outs_data[i] = (*outs)[i]->mutable_data<OutT>();
@@ -444,7 +443,7 @@ void LaunchBroadcastKernelForDifferentVecSize(
           "The maximum dimension of input tensor is expected to be less than "
           "%d, but recieved %d.\n",
           merge_dims.dim_size,
-          paddle::framework::DDim::kMaxRank));
+          pten::framework::DDim::kMaxRank));
     }
   }
 #undef CALL_BROADCAST_FOR_DIM_SIZE
@@ -1826,8 +1825,8 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx,
   }
 
   VLOG(3) << "CommonElementwiseBroadcastBackward xdims:"
-          << paddle::framework::make_ddim(x_dims_array)
-          << " ydim:" << paddle::framework::make_ddim(y_dims_array);
+          << pten::framework::make_ddim(x_dims_array)
+          << " ydim:" << pten::framework::make_ddim(y_dims_array);
 
   CommonGradBroadcastCUDA<T, DX_OP, DY_OP, Tout>(x,
                                                  y,
diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h
index e7d1d2d5f44fc..e247f786cc68d 100644
--- a/paddle/pten/kernels/gpu/reduce.h
+++ b/paddle/pten/kernels/gpu/reduce.h
@@ -32,7 +32,6 @@
 namespace cub = hipcub;
 #endif
 
-#include "paddle/fluid/framework/array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/amp/fp16_type_traits.h"
 #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
@@ -41,6 +40,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/fast_divmod.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/core/array.h"
 
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
@@ -118,7 +118,7 @@ static inline void CheckReduceRank(int reduce_rank, int rank) {
 
 // convert dims from vector to array
 template <typename T, size_t ElementCount, typename VectorLikeType>
-static inline paddle::framework::Array<T, ElementCount> VectorToArray(
+static inline pten::framework::Array<T, ElementCount> VectorToArray(
     const VectorLikeType& vec) {
   PADDLE_ENFORCE_LE(vec.size(),
                     ElementCount,
@@ -128,7 +128,7 @@ static inline paddle::framework::Array<T, ElementCount> VectorToArray(
                         vec.size(),
                         ElementCount));
   size_t n = static_cast<size_t>(vec.size());
-  paddle::framework::Array<T, ElementCount> ret;
+  pten::framework::Array<T, ElementCount> ret;
   for (size_t i = 0; i < n; ++i) {
     ret[i] = vec[i];
   }
@@ -162,7 +162,7 @@ static inline std::vector<int> GetReduceDim(const std::vector<int64_t>& dims,
 
 }  // namespace details
 
-constexpr int kMaxRank = paddle::framework::DDim::kMaxRank;
+constexpr int kMaxRank = pten::framework::DDim::kMaxRank;
 
 enum ReduceType {
   kReduceLastDim = 0x01,    // when reduce_dim[0] == x_dim.size() - 1;
@@ -202,9 +202,9 @@ struct IndexCalculator {
   }
 
   int dim;
-  paddle::framework::Array<int, kMaxRank> dims;
-  paddle::framework::Array<int, kMaxRank> strides;
-  paddle::framework::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
+  pten::framework::Array<int, kMaxRank> dims;
+  pten::framework::Array<int, kMaxRank> strides;
+  pten::framework::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
 };
 
 template <bool ReduceLastDim = false>
@@ -326,7 +326,7 @@ struct ReduceConfig {
                      const paddle::platform::Place& place,
                      pten::DenseTensor* tmp) {
     if (should_reduce_again) {
-      tmp->ResizeAndAllocate(paddle::framework::make_ddim(
+      tmp->ResizeAndAllocate(pten::framework::make_ddim(
           {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))}));
       output_data = tmp->mutable_data<Ty>();
     } else {
@@ -1029,7 +1029,7 @@ static
   pten::DenseTensor tmp = pten::DenseTensor(
       pten::make_intrusive<paddle::experimental::SharedStorage>(place),
       pten::DenseTensorMeta(pten::DataType::UINT8,
-                            paddle::framework::make_ddim(
+                            pten::framework::make_ddim(
                                 {static_cast<int64_t>(temp_storage_bytes)})));
 
   auto* temp_storage = tmp.mutable_data<uint8_t>();
@@ -1073,7 +1073,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
   // Allocate memory
   y->mutable_data<Ty>();
 
-  auto x_dim = paddle::framework::vectorize<int>(x.dims());
+  auto x_dim = pten::framework::vectorize<int>(x.dims());
   auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
   config.Run();
   int numel = x.numel();
diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h
index 39cdbad5146de..557f6fae7b7f9 100644
--- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h
+++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h
@@ -103,7 +103,7 @@ struct DotGradFunction<DeviceContext,
       auto* data_dx = tensor_dx->mutable_data<T>();
       const auto* data_y = tensor_y->data<T>();
       const DDim& dim = tensor_x->dims();
-      size_t N = static_cast<size_t>(paddle::framework::product(dim));
+      size_t N = static_cast<size_t>(pten::framework::product(dim));
 
       auto step = dim[dim.size() - 1];
 
@@ -118,7 +118,7 @@ struct DotGradFunction<DeviceContext,
       auto* data_dy = tensor_dy->mutable_data<T>();
       const auto* data_x = tensor_x->data<T>();
       const DDim& dim = tensor_y->dims();
-      size_t N = static_cast<size_t>(paddle::framework::product(dim));
+      size_t N = static_cast<size_t>(pten::framework::product(dim));
 
       auto step = dim[dim.size() - 1];
 
diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h
index 134a815799de6..2900e2e83bd65 100644
--- a/paddle/pten/kernels/impl/full_kernel_impl.h
+++ b/paddle/pten/kernels/impl/full_kernel_impl.h
@@ -36,7 +36,7 @@ void FullKernel(const Context& dev_ctx,
                 const ScalarArray& shape,
                 const Scalar& val,
                 DenseTensor* out) {
-  out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData()));
+  out->ResizeAndAllocate(pten::framework::make_ddim(shape.GetData()));
   FullValue<T>(dev_ctx, out, val.to<T>());
 }
 
diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
index b1bae78ddc5fa..71fadfae7deb8 100644
--- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
@@ -135,7 +135,7 @@ static DDim RowMatrixFromVector(const DDim& x_dim) {
   if (x_dim.size() > 1) {
     return x_dim;
   }
-  return paddle::framework::make_ddim({1, x_dim[0]});
+  return pten::framework::make_ddim({1, x_dim[0]});
 }
 
 /**
@@ -146,7 +146,7 @@ static DDim ColumnMatrixFromVector(const DDim& y_dim) {
   if (y_dim.size() > 1) {
     return y_dim;
   }
-  return paddle::framework::make_ddim({y_dim[0], 1});
+  return pten::framework::make_ddim({y_dim[0], 1});
 }
 
 /**
diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h
index 5ea9729655ecc..afe6bf71e2f6b 100644
--- a/paddle/pten/kernels/impl/matmul_kernel_impl.h
+++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h
@@ -164,7 +164,7 @@ void MatMulFunction(const Context& dev_ctx,
       std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin());
       out_dims.back() = y_dims.back();
     }
-    Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims));
+    Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims));
     Out->mutable_data<T>();
     if (trans_y) {
       const int M = Y.numel() / N;
@@ -242,7 +242,7 @@ void MatMulFunction(const Context& dev_ctx,
     } else {
       std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin());
     }
-    Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims));
+    Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims));
     Out->mutable_data<T>();
 
     if (trans_x) {
@@ -330,7 +330,7 @@ void MatMulFunction(const Context& dev_ctx,
   out_broadcast_dims[ndim - 2] = M;
   out_broadcast_dims[ndim - 1] = N;
 
-  Out->ResizeAndAllocate(paddle::framework::make_ddim(out_broadcast_dims));
+  Out->ResizeAndAllocate(pten::framework::make_ddim(out_broadcast_dims));
   Out->mutable_data<T>();
 
   const int batch_dim = ndim - 2;
@@ -493,12 +493,12 @@ void MatmulKernel(const Context& dev_ctx,
                   bool transpose_x,
                   bool transpose_y,
                   DenseTensor* out) {
-  PADDLE_ENFORCE_NE(paddle::framework::product(x.dims()),
+  PADDLE_ENFORCE_NE(pten::framework::product(x.dims()),
                     0,
                     paddle::platform::errors::InvalidArgument(
                         "The Input(X) dims size must not be equal 0,"
                         " but reviced dims size is 0. "));
-  PADDLE_ENFORCE_NE(paddle::framework::product(y.dims()),
+  PADDLE_ENFORCE_NE(pten::framework::product(y.dims()),
                     0,
                     paddle::platform::errors::InvalidArgument(
                         "The Input(Y) dims size must not be equal 0,"
diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc
index 6608d1ed08cab..0a3b56e3f18d4 100644
--- a/paddle/pten/tests/api/test_cast_api.cc
+++ b/paddle/pten/tests/api/test_cast_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, cast) {
diff --git a/paddle/pten/tests/api/test_conj_api.cc b/paddle/pten/tests/api/test_conj_api.cc
index 50d190257a16d..c17b0f23f4f6b 100644
--- a/paddle/pten/tests/api/test_conj_api.cc
+++ b/paddle/pten/tests/api/test_conj_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, conj) {
diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc
index 40e709b960334..97616d0cbcd57 100644
--- a/paddle/pten/tests/api/test_dot_api.cc
+++ b/paddle/pten/tests/api/test_dot_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, dot) {
diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc
index 69af32eb457a6..17a6ffde9df0a 100644
--- a/paddle/pten/tests/api/test_elementwise_api.cc
+++ b/paddle/pten/tests/api/test_elementwise_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, add) {
diff --git a/paddle/pten/tests/api/test_empty_api.cc b/paddle/pten/tests/api/test_empty_api.cc
index f4e3f472c7990..f38e91b02b705 100644
--- a/paddle/pten/tests/api/test_empty_api.cc
+++ b/paddle/pten/tests/api/test_empty_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, empty_like) {
diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc
index 0d823765680e8..7910cc840f5ef 100644
--- a/paddle/pten/tests/api/test_fill_api.cc
+++ b/paddle/pten/tests/api/test_fill_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, full_like) {
diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc
index 6c082b9653e6f..cf8fa9cb1895f 100644
--- a/paddle/pten/tests/api/test_flatten_api.cc
+++ b/paddle/pten/tests/api/test_flatten_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, flatten) {
diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc
index 03f686f1c3f5e..08e0e888b99ed 100644
--- a/paddle/pten/tests/api/test_matmul_api.cc
+++ b/paddle/pten/tests/api/test_matmul_api.cc
@@ -26,7 +26,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(API, matmul_cpu) {
   // 1. create tensor
diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc
index 9d90e58101cbd..a7b85cff12cc1 100644
--- a/paddle/pten/tests/api/test_mean_api.cc
+++ b/paddle/pten/tests/api/test_mean_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, mean) {
diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc
index 59e9e9fab1122..bfd1ea841443f 100644
--- a/paddle/pten/tests/api/test_reshape_api.cc
+++ b/paddle/pten/tests/api/test_reshape_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, reshape) {
diff --git a/paddle/pten/tests/api/test_scale_api.cc b/paddle/pten/tests/api/test_scale_api.cc
index 5ad52142765ba..bb5523d26c4e1 100644
--- a/paddle/pten/tests/api/test_scale_api.cc
+++ b/paddle/pten/tests/api/test_scale_api.cc
@@ -24,7 +24,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 void CheckScaleResult(experimental::Tensor* out) {
   ASSERT_EQ(out->dims().size(), 2);
diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc
index 5a7c9840e1114..c0d5a89eeb744 100644
--- a/paddle/pten/tests/api/test_sum_api.cc
+++ b/paddle/pten/tests/api/test_sum_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, sum) {
diff --git a/paddle/pten/tests/api/test_to_api.cc b/paddle/pten/tests/api/test_to_api.cc
index 9aef716029a69..fa999aace6678 100644
--- a/paddle/pten/tests/api/test_to_api.cc
+++ b/paddle/pten/tests/api/test_to_api.cc
@@ -25,7 +25,7 @@ namespace paddle {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 paddle::experimental::Tensor CreateInputTensor() {
   const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc
index 80328d0b243e8..3b1412a8e5f4e 100644
--- a/paddle/pten/tests/kernels/test_cast_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc
@@ -28,7 +28,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, cast) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc
index 6f2ea0602b81d..51066d8ae4783 100644
--- a/paddle/pten/tests/kernels/test_conj_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc
@@ -26,7 +26,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, conj) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc
index d690b29d71f6f..4f8bd727716ce 100644
--- a/paddle/pten/tests/kernels/test_copy_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc
@@ -26,7 +26,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(YuanRisheng): This TEST file need to be refactored after 'copy' realized
 // in 'paddle/api'
diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc
index b1c23d4a768e6..1aa21b847fac4 100644
--- a/paddle/pten/tests/kernels/test_creation_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc
@@ -27,7 +27,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, empty) {
   // 1. create input
diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc
index 4213240f57ba8..e4978d84c835c 100644
--- a/paddle/pten/tests/kernels/test_dot_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc
@@ -26,7 +26,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, dot) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index 23583a843561b..0bc16371c0731 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -26,7 +26,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, add) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
index 13fc327b66945..78cd6261c3a41 100644
--- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc
@@ -36,7 +36,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, flatten) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
index 118215db505d5..76f7750319210 100644
--- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc
@@ -25,7 +25,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, dot) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc
index a8860540fd0c9..07ec30afad5ca 100644
--- a/paddle/pten/tests/kernels/test_mean_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc
@@ -25,7 +25,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, mean) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
index 52038593d7012..dc90043305ca0 100644
--- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc
@@ -25,7 +25,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(DEV_API, reshape) {
diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc
index 1c0be6c06aacd..106835a204c65 100644
--- a/paddle/pten/tests/kernels/test_scale_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc
@@ -25,7 +25,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, scale) {
   // 1. create tensor
diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc
index 2b11ba9595c53..41d694a025f42 100644
--- a/paddle/pten/tests/kernels/test_sum_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc
@@ -25,7 +25,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, sum) {
   // 1. create tensor

From 4f1fef60b175d5d5b19a2f2cdc8487888f8a1b9a Mon Sep 17 00:00:00 2001
From: TTerror <tangzhiyi11@users.noreply.github.com>
Date: Fri, 21 Jan 2022 15:18:01 +0800
Subject: [PATCH 07/15] refactor unittest for kunlun (#38772)

* refactor unittests for kunlun

* refactor unittests for kunlun, test=kunlun
---
 .../fluid/platform/device/xpu/xpu_op_list.cc  |  26 ++
 .../fluid/platform/device/xpu/xpu_op_list.h   |   8 +
 paddle/fluid/pybind/pybind.cc                 |   8 +
 paddle/scripts/paddle_build.sh                |   3 +
 .../unittests/xpu/get_test_cover_info.py      | 242 ++++++++++++++
 .../unittests/xpu/test_refactor_op_xpu.py     | 297 ++++++++++++++++++
 6 files changed, 584 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py

diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
index 448559a9edfee..36be4a55d0a6f 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc
@@ -74,6 +74,32 @@ bool is_in_xpu_black_list(const std::string& op_name) {
   return false;
 }
 
+// Data types of op_name's kernels for the given XPU version (empty if none).
+std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
+                                                   XPUVersion version) {
+  std::vector<vartype::Type> supported_types;
+  auto& op_table = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
+  auto found = op_table.find(op_name);
+  if (found == op_table.end()) return supported_types;
+  for (auto& kernel : found->second) {
+    supported_types.push_back(kernel.data_type_);
+  }
+  return supported_types;
+}
+
+// Maps each op in the table selected by `version` to its kernel data types.
+XPUOpListMap get_xpu_op_list(XPUVersion version) {
+  auto& op_table = version == XPU1 ? get_kl1_ops() : get_kl2_ops();
+  XPUOpListMap op_to_types;
+  for (auto& entry : op_table) {
+    auto& types = op_to_types[entry.first];
+    for (auto& kernel : entry.second) {
+      types.push_back(kernel.data_type_);
+    }
+  }
+  return op_to_types;
+}
+
 }  // namespace platform
 }  // namespace paddle
 #endif
diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h
index 705f701e13634..3672d68492a6f 100644
--- a/paddle/fluid/platform/device/xpu/xpu_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h
@@ -12,6 +12,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 #include <string>
+#include <unordered_map>
 
 #include "paddle/fluid/framework/op_kernel_type.h"
 
@@ -19,10 +20,17 @@ namespace paddle {
 namespace platform {
 
 using pOpKernelType = paddle::framework::OpKernelType;
+using vartype = paddle::framework::proto::VarType;
+using XPUOpListMap =
+    std::unordered_map<std::string, std::vector<vartype::Type>>;
 
 bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type);
 bool is_in_xpu_black_list(const std::string& op_name);
 
+std::vector<vartype::Type> get_xpu_op_support_type(const std::string& op_name,
+                                                   XPUVersion version);
+XPUOpListMap get_xpu_op_list(XPUVersion version);
+
 }  // namespace platform
 }  // namespace paddle
 #endif
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 176db6b48c5ed..cd999f17f3a2f 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -129,6 +129,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/device/xpu/xpu_info.h"
+#include "paddle/fluid/platform/device/xpu/xpu_op_list.h"
 #endif
 
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
@@ -1762,6 +1763,13 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("get_xpu_device_count", platform::GetXPUDeviceCount);
   m.def("get_xpu_device_version",
         [](int device_id) { return platform::get_xpu_version(device_id); });
+  m.def("get_xpu_device_op_support_types",
+        [](const std::string &op_name, platform::XPUVersion version) {
+          return platform::get_xpu_op_support_type(op_name, version);
+        });
+  m.def("get_xpu_device_op_list", [](platform::XPUVersion version) {
+    return platform::get_xpu_op_list(version);
+  });
   m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool {
     // XPUs with Compute Capability > xpu2 support float16 and bfloat16
     return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1;
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index cf326a68e5948..384dfbf558f42 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -1725,6 +1725,7 @@ function parallel_test_base_xpu() {
 EOF
 
 set +x
+        export XPU_OP_LIST_DIR=$tmp_dir
         ut_startTime_s=`date +%s`
         test_cases=$(ctest -N -V | grep "_xpu" )        # cases list which would be run exclusively
         get_quickly_disable_ut||disable_ut_quickly='disable_ut'   # indicate whether the case was in quickly disable list
@@ -1747,6 +1748,8 @@ set -x
         if [[ "$EXIT_CODE" != "0" ]]; then
             exit 8;
         fi
+        python ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
+        unset XPU_OP_LIST_DIR
     fi   
 }
 
diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
new file mode 100644
index 0000000000000..31246436efae2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py
@@ -0,0 +1,242 @@
+#   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import inspect
+import os
+import fcntl
+
+import paddle
+import paddle.fluid.core as core
+
+type_dict_paddle_to_str = {
+    paddle.bool: 'bool',
+    paddle.uint8: 'uint8',
+    paddle.int8: 'int8',
+    paddle.int16: 'int16',
+    paddle.int32: 'int32',
+    paddle.int64: 'int64',
+    paddle.float16: 'float16',
+    paddle.float32: 'float32',
+    paddle.float64: 'float64',
+    paddle.complex128: 'complex128',
+    paddle.complex64: 'complex64',
+}
+
+type_dict_str_to_paddle = {
+    'int32': paddle.int32,
+    'int64': paddle.int64,
+    'float32': paddle.float32,
+    'float16': paddle.float16,
+    'bool': paddle.bool,
+    'uint8': paddle.uint8,
+    'int8': paddle.int8,
+    'complex128': paddle.complex128,
+    'complex64': paddle.complex64,
+    'int16': paddle.int16,
+}
+
+xpu_test_op_white_list = []
+xpu_test_type_white_list = []
+xpu_test_op_type_white_list = []
+xpu_test_device_op_white_list = []
+xpu_test_device_op_type_white_list = []
+
+
+class XPUOpTestWrapper(object):
+    def create_classes(self):
+        base_class = None
+        classes = []
+        return base_class, classes
+
+
+def get_op_white_list():
+    op_white_list = list(xpu_test_op_white_list)  # copy; keep the global intact
+    env_value = os.getenv('XPU_TEST_OP_WHITE_LIST')  # comma-separated entries
+    if env_value is not None:
+        op_white_list.extend(env_value.strip().split(','))
+    return list(set(op_white_list))  # de-duplicated; order is not preserved
+
+
+def get_type_white_list():
+    type_white_list = list(xpu_test_type_white_list)  # copy; keep global intact
+    env_value = os.getenv('XPU_TEST_TYPE_WHITE_LIST')  # comma-separated entries
+    if env_value is not None:
+        type_white_list.extend(env_value.strip().split(','))
+    return list(set(type_white_list))  # de-duplicated; order is not preserved
+
+
+def get_op_type_white_list():
+    op_type_white_list = list(xpu_test_op_type_white_list)  # copy, not alias
+    env_value = os.getenv('XPU_TEST_OP_TYPE_WHITE_LIST')  # comma-separated
+    if env_value is not None:
+        op_type_white_list.extend(env_value.strip().split(','))
+    return list(set(op_type_white_list))  # de-duplicated; order not preserved
+
+
+def get_device_op_white_list():
+    device_op_white_list = list(xpu_test_device_op_white_list)  # copy, not alias
+    env_value = os.getenv('XPU_TEST_DEVICE_OP_WHITE_LIST')  # comma-separated
+    if env_value is not None:
+        device_op_white_list.extend(env_value.strip().split(','))
+    return list(set(device_op_white_list))  # de-duplicated; order not preserved
+
+
+def get_device_op_type_white_list():
+    device_op_type_white_list = list(xpu_test_device_op_type_white_list)  # copy
+    env_value = os.getenv('XPU_TEST_DEVICE_OP_TYPE_WHITE_LIST')  # comma-separated
+    if env_value is not None:
+        device_op_type_white_list.extend(env_value.strip().split(','))
+    return list(set(device_op_type_white_list))  # de-duplicated, unordered
+
+
+def make_xpu_op_list(xpu_version):
+    ops = []
+    raw_op_list = core.get_xpu_device_op_list(xpu_version)
+    version_str = "xpu2" if xpu_version == core.XPUVersion.XPU2 else "xpu1"
+    op_white_list = get_op_white_list()
+    type_white_list = get_type_white_list()
+    op_type_white_list = get_op_type_white_list()
+    device_op_white_list = get_device_op_white_list()
+    device_op_type_white_list = get_device_op_type_white_list()
+    print('op_white_list:', op_white_list)
+    print('type_white_list:', type_white_list)
+    print('op_type_white_list:', op_type_white_list)
+    print('device_op_white_list:', device_op_white_list)
+    print('device_op_type_white_list:', device_op_type_white_list)
+
+    for op_name, type_list in raw_op_list.items():
+        device_op_name = version_str + '_' + op_name
+        if op_name in op_white_list or device_op_name in device_op_white_list:
+            continue
+        for op_type in type_list:
+            if op_type in type_white_list or op_type not in type_dict_paddle_to_str.keys(
+            ):
+                continue
+
+            device_op_type_name = device_op_name + '_' + type_dict_paddle_to_str[
+                op_type]
+            if device_op_type_name in device_op_type_white_list:
+                continue
+
+            op_type_name = op_name + '_' + type_dict_paddle_to_str[op_type]
+            if op_type_name in op_type_white_list:
+                continue
+
+            ops.append(op_type_name)
+    return ops
+
+
+def get_xpu_op_support_types(op_name, dev_id=0):
+    """Return the dtype names `op_name` supports on XPU device `dev_id`."""
+    xpu_version = core.get_xpu_device_version(dev_id)
+    support_type_list = core.get_xpu_device_op_support_types(op_name,
+                                                             xpu_version)
+    # Skip dtypes that have no name here (as make_xpu_op_list does), no KeyError.
+    return [type_dict_paddle_to_str[x] for x in support_type_list
+            if x in type_dict_paddle_to_str]
+
+
+def record_op_test(op_name, test_type):
+    dirname = os.getenv('XPU_OP_LIST_DIR')
+    filename = 'xpu_op_test'
+    if dirname is not None:
+        filename = os.path.join(dirname, filename)
+    with open(filename, 'a') as f:
+        fcntl.flock(f, fcntl.LOCK_EX)
+        f.write(op_name + '_' + test_type + '\n')
+
+
+def is_empty_grad_op_type(xpu_version, op, test_type):
+    """Return True if `op`_grad is not listed for dtype `test_type`."""
+    xpu_op_list = core.get_xpu_device_op_list(xpu_version)
+    grad_op = op + '_grad'
+    if grad_op not in xpu_op_list:
+        return True
+
+    # Look up the dtypes listed for the grad op itself; indexing with the
+    # forward op name would check the forward op's dtypes instead.
+    grad_op_types = xpu_op_list[grad_op]
+    paddle_test_type = type_dict_str_to_paddle[test_type]
+    return paddle_test_type not in grad_op_types
+
+
+def create_test_class(func_globals,
+                      test_class,
+                      test_type,
+                      test_grad=True,  # NOTE(review): not read in this body
+                      ignore_deivce_version=[],
+                      test_deivce_version=[]):
+    xpu_version = core.get_xpu_device_version(0)  # always queries device 0
+    if xpu_version in ignore_deivce_version:
+        return  # this device version is explicitly excluded
+
+    if len(test_deivce_version) != 0 and xpu_version not in test_deivce_version:
+        return  # an allow-list was given and this version is not in it
+
+    test_class_obj = test_class()
+    register_classes = inspect.getmembers(test_class_obj, inspect.isclass)
+    op_name = test_class_obj.op_name
+    no_grad = is_empty_grad_op_type(xpu_version, op_name, test_type)
+
+    for test_class in register_classes:  # (name, class) pairs; shadows the arg
+        if test_class[0] == '__class__':  # the wrapper's own type, not a case
+            continue
+        class_obj = test_class[1]
+        cls_name = "{0}_{1}".format(test_class[0], str(test_type))
+        func_globals[cls_name] = type(cls_name, (class_obj, ),
+                                      {'in_type': test_type})
+
+    if hasattr(test_class_obj, 'use_dynamic_create_class'
+               ) and test_class_obj.use_dynamic_create_class:
+        base_class, dynamic_classes = test_class_obj.dynamic_create_class()
+        for dy_class in dynamic_classes:  # each entry is [name, attr_dict]
+            cls_name = "{0}_{1}".format(dy_class[0], str(test_type))
+            attr_dict = dy_class[1]
+            attr_dict['in_type'] = test_type  # mutates the dict that was returned
+            func_globals[cls_name] = type(cls_name, (base_class, ), attr_dict)
+
+    record_op_test(op_name, test_type)  # log "<op>_<dtype>" as covered
+    if not no_grad:
+        record_op_test(op_name + '_grad', test_type)
+
+
+def get_test_cover_info():
+    xpu_version = core.get_xpu_device_version(0)
+    version_str = "xpu2" if xpu_version == core.XPUVersion.XPU2 else "xpu1"
+    xpu_op_list = make_xpu_op_list(xpu_version)
+    xpu_op_covered = []
+
+    dirname = os.getenv('XPU_OP_LIST_DIR')
+    filename = 'xpu_op_test'
+    if dirname is not None:
+        filename = os.path.join(dirname, filename)
+    if os.path.exists(filename) and os.path.isfile(filename):
+        with open(filename) as f:
+            for line in f:
+                test_op_name = line.strip()
+                if test_op_name in xpu_op_list:
+                    xpu_op_covered.append(test_op_name)
+    diff_list = list(set(xpu_op_list).difference(set(xpu_op_covered)))
+    total_len = len(set(xpu_op_list))
+    covered_len = len(set(xpu_op_covered))
+    print('{} test: {}/{}'.format(version_str, covered_len, total_len))
+    if (len(diff_list) != 0):
+        print("These ops need to be tested on {0}! ops:{1}".format(
+            version_str, ','.join(diff_list)))
+
+
+if __name__ == '__main__':
+    get_test_cover_info()
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
new file mode 100644
index 0000000000000..cb54d12488d54
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py
@@ -0,0 +1,297 @@
+#  Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid import compiler, Program, program_guard
+
+import op_test
+from op_test import OpTest, skip_check_grad_ci
+from op_test_xpu import XPUOpTest
+from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
+
+paddle.enable_static()
+
+
+def huber_loss_forward(val, delta):
+    abs_val = abs(val)
+    if abs_val <= delta:
+        return 0.5 * val * val
+    else:
+        return delta * (abs_val - 0.5 * delta)
+
+
+# 1. Generate the test cases for different parameters dynamically: the wrapper
+# class must implement dynamic_create_class and set use_dynamic_create_class to True.
+class XPUTestArgsortOp1(XPUOpTestWrapper):
+    """Wrapper whose argsort cases are generated by dynamic_create_class."""
+
+    def __init__(self):
+        self.op_name = 'argsort'
+        self.use_dynamic_create_class = True
+
+    def dynamic_create_class(self):
+        """Return the base test class and one [name, attrs] pair per case."""
+        base_class = self.TestArgsortOp
+        classes = []
+        for descending in [True, False]:
+            for axis in [0, 1, 2, -1, -2]:
+                # Encode both parameters so the two directions do not collide.
+                class_name = 'XPUTestArgsortOp_axis_{0}_{1}'.format(
+                    axis, descending)
+                attr_dict = {'init_axis': axis, 'init_descending': descending}
+                classes.append([class_name, attr_dict])
+        return base_class, classes
+
+    class TestArgsortOp(XPUOpTest):
+        def setUp(self):
+            self.set_xpu()
+            self.op_type = "argsort"
+            self.place = paddle.XPUPlace(0)
+            self.dtype = self.in_type
+            self.input_shape = (2, 2, 2, 3, 3)
+            # Per-case values from dynamic_create_class, else the defaults.
+            self.axis = getattr(self, 'init_axis', -1)
+            self.descending = getattr(self, 'init_descending', False)
+
+            if self.in_type == 'float32':
+                self.x = np.random.random(self.input_shape).astype(self.dtype)
+            else:
+                self.x = np.random.randint(
+                    low=-1000, high=1000,
+                    size=self.input_shape).astype(self.dtype)
+            self.inputs = {"X": self.x}
+            self.attrs = {"axis": self.axis, "descending": self.descending}
+            self.get_output()
+            self.outputs = {"Out": self.sorted_x, "Indices": self.indices}
+
+        def get_output(self):
+            # NumPy reference; a descending sort is the ascending one flipped.
+            self.indices = np.argsort(self.x, kind='heapsort', axis=self.axis)
+            self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis)
+            if self.descending:
+                self.indices = np.flip(self.indices, self.axis)
+                self.sorted_x = np.flip(self.sorted_x, self.axis)
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+            self.__class__.no_need_check_grad = True
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+
+# 2. Define one explicit test class per case; use_dynamic_create_class must be False.
+class XPUTestArgsortOp2(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'argsort'
+        self.use_dynamic_create_class = False
+
+    class TestArgsortOp(XPUOpTest):
+        def setUp(self):
+            self.set_xpu()
+            self.op_type = "argsort"
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
+            self.init_inputshape()
+            self.init_axis()
+            self.init_direction()
+
+            if self.in_type == 'float32':
+                self.x = np.random.random(self.input_shape).astype(self.dtype)
+            else:
+                self.x = np.random.randint(
+                    low=-1000, high=1000,
+                    size=self.input_shape).astype(self.dtype)
+            self.inputs = {"X": self.x}
+            self.attrs = {"axis": self.axis, "descending": self.descending}
+            self.get_output()
+            self.outputs = {"Out": self.sorted_x, "Indices": self.indices}
+
+        def get_output(self):
+            if self.descending:
+                self.indices = np.flip(
+                    np.argsort(
+                        self.x, kind='heapsort', axis=self.axis),
+                    self.axis)
+                self.sorted_x = np.flip(
+                    np.sort(
+                        self.x, kind='heapsort', axis=self.axis), self.axis)
+            else:
+                self.indices = np.argsort(
+                    self.x, kind='heapsort', axis=self.axis)
+                self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis)
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+            self.__class__.no_need_check_grad = True
+
+        def init_inputshape(self):
+            self.input_shape = (2, 2, 2, 3, 3)
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def init_axis(self):
+            self.axis = -1
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def init_direction(self):
+            self.descending = False
+
+    class TestArgsortOpAxis0XPU(TestArgsortOp):
+        def init_axis(self):
+            self.axis = 0
+
+    class TestArgsortOpAxis1XPU(TestArgsortOp):
+        def init_axis(self):
+            self.axis = 1
+
+    class TestArgsortOpAxis2XPU(TestArgsortOp):
+        def init_axis(self):
+            self.axis = 2
+
+    class TestArgsortOpAxisNeg1XPU(TestArgsortOp):
+        def init_axis(self):
+            self.axis = -1
+
+    class TestArgsortOpAxisNeg2XPU(TestArgsortOp):
+        def init_axis(self):
+            self.axis = -2
+
+    class TestArgsortOpDescendingAxisXPU(TestArgsortOp):
+        def init_direction(self):
+            self.descending = True
+
+    class TestArgsortOpDescendingAxis0XPU(TestArgsortOpAxis0XPU):
+        def init_direction(self):
+            self.descending = True
+
+    class TestArgsortOpDescendingAxis1XPU(TestArgsortOpAxis1XPU):
+        def init_direction(self):
+            self.descending = True
+
+    class TestArgsortOpDescendingAxis2XPU(TestArgsortOpAxis2XPU):
+        def init_direction(self):
+            self.descending = True
+
+    class TestArgsortOpDescendingAxisNeg1XPU(TestArgsortOpAxisNeg1XPU):
+        def init_direction(self):
+            self.descending = True
+
+    class TestArgsortOpDescendingAxisNeg2XPU(TestArgsortOpAxisNeg2XPU):
+        def init_direction(self):
+            self.descending = True
+
+
+support_types = get_xpu_op_support_types('argsort')
+for stype in support_types:
+    create_test_class(globals(), XPUTestArgsortOp1, stype)
+    create_test_class(globals(), XPUTestArgsortOp2, stype)
+
+
+class XPUTestHuberLossOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'huber_loss'
+        self.use_dynamic_create_class = False
+
+    class TestHuberLossOp(XPUOpTest):
+        def setUp(self):
+            self.set_xpu()
+            self.op_type = 'huber_loss'
+            self.place = paddle.XPUPlace(0)
+
+            self.init_dtype()
+
+            self.set_inputs()
+            self.set_attrs()
+            self.set_outputs()
+
+        def set_inputs(self):
+            shape = self.set_shape()
+            x = np.random.uniform(0, 1., shape).astype(self.dtype)
+            y = np.random.uniform(0, 1., shape).astype(self.dtype)
+            self.inputs = {
+                'X': OpTest.np_dtype_to_fluid_dtype(x),
+                'Y': OpTest.np_dtype_to_fluid_dtype(y)
+            }
+
+        def set_attrs(self):
+            self.attrs = {'delta': 0.5}
+
+        def set_outputs(self):
+            delta = self.attrs['delta']
+            shape = self.set_shape()
+            residual = self.inputs['Y'] - self.inputs['X']
+            loss = np.vectorize(huber_loss_forward)(residual,
+                                                    delta).astype(self.dtype)
+            self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)}
+
+        def set_shape(self):
+            return (100, 1)
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad_normal(self):
+            self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
+
+        def test_check_grad_ingore_x(self):
+            self.check_grad_with_place(
+                self.place, ['Y'], 'Out', no_grad_set=set("residual"))
+
+        def test_check_grad_ingore_y(self):
+            self.check_grad_with_place(
+                self.place, ['X'], 'Out', no_grad_set=set('residual'))
+
+    class TestHuberLossOp1(TestHuberLossOp):
+        def set_shape(self):
+            return (640)
+
+    class TestHuberLossOp2(TestHuberLossOp):
+        def set_shape(self):
+            return (10, 10)
+
+    class TestHuberLossOp3(TestHuberLossOp):
+        def set_shape(self):
+            return (10, 10, 1)
+
+
+support_types = get_xpu_op_support_types('huber_loss')
+for stype in support_types:
+    create_test_class(globals(), XPUTestHuberLossOp, stype)
+    create_test_class(
+        globals(),
+        XPUTestHuberLossOp,
+        stype,
+        ignore_deivce_version=[core.XPUVersion.XPU1])
+
+if __name__ == '__main__':
+    unittest.main()

From fdab43b56692c93a5a732108cca66638796ed66f Mon Sep 17 00:00:00 2001
From: fwenguang <95677191+fwenguang@users.noreply.github.com>
Date: Fri, 21 Jan 2022 15:25:22 +0800
Subject: [PATCH 08/15] [MLU]add mlu ci dockerfile (#39021)

* [MLU]add mlu ci dockerfile

* fix comment

* add cncl
---
 paddle/fluid/framework/tensor_util.h      |  2 +-
 paddle/fluid/memory/memcpy.cc             | 17 ++++++
 paddle/fluid/operators/mean_op_mlu.cc     |  8 +--
 paddle/fluid/operators/mlu/CMakeLists.txt |  2 +-
 tools/dockerfile/Dockerfile.mlu           | 73 +++++++++++++++++++++++
 5 files changed, 94 insertions(+), 8 deletions(-)
 create mode 100644 tools/dockerfile/Dockerfile.mlu

diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 3c62f3c5e43d7..f0c41e6dc0fcf 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -235,7 +235,7 @@ void TensorFromVector(const std::vector<T>& src,
   }
 #endif
 #ifdef PADDLE_WITH_MLU
-  if (platform::is_mlu_place(dst_place)) {
+  else if (platform::is_mlu_place(dst_place)) {  // NOLINT
     memory::Copy(
         dst_place, dst_ptr, src_place, src_ptr, size,
         reinterpret_cast<const platform::MLUDeviceContext&>(ctx).stream());
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index f804c2af53916..6d348ceb87c83 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -962,6 +962,23 @@ void Copy<pten::Place, pten::MLUPlace>(pten::Place dst_place, void* dst,
        stream);
 }
 
+// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
+template <>
+void Copy<pten::CPUPlace, pten::Place>(pten::CPUPlace dst_place, void* dst,
+                                       pten::Place src_place, const void* src,
+                                       size_t num, mluStream stream) {
+  Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream);
+}
+
+// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
+template <>
+void Copy<pten::Place, pten::CPUPlace>(pten::Place dst_place, void* dst,
+                                       pten::CPUPlace src_place,
+                                       const void* src, size_t num,
+                                       mluStream stream) {
+  Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream);
+}
+
 #endif  // PADDLE_WITH_MLU
 
 // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc
index 9862c2bd95256..ca4f3dcc3f465 100644
--- a/paddle/fluid/operators/mean_op_mlu.cc
+++ b/paddle/fluid/operators/mean_op_mlu.cc
@@ -35,9 +35,7 @@ class MeanMLUKernel : public framework::OpKernel<T> {
     auto stream = context.template device_context<MLUDeviceContext>().stream();
 
     if (rank == 0) {  // scalar
-      auto mlu_place = BOOST_GET(platform::MLUPlace, place);
-      memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T),
-                   stream);
+      memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
       return;
     }
 
@@ -85,9 +83,7 @@ class MeanMLUGradKernel : public framework::OpKernel<T> {
     auto stream = context.template device_context<MLUDeviceContext>().stream();
 
     if (rank == 0) {  // scalar
-      auto mlu_place = BOOST_GET(platform::MLUPlace, place);
-      memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T),
-                   stream);
+      memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream);
       return;
     }
 
diff --git a/paddle/fluid/operators/mlu/CMakeLists.txt b/paddle/fluid/operators/mlu/CMakeLists.txt
index 3fc411d6d13fa..59fab48b271d5 100644
--- a/paddle/fluid/operators/mlu/CMakeLists.txt
+++ b/paddle/fluid/operators/mlu/CMakeLists.txt
@@ -1,5 +1,5 @@
 
 IF(WITH_MLU)
-    cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib)
+    cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib framework_proto xxhash)
     cc_test(activation_op_mlu_test SRCS activation_op_mlu_test.cc DEPS op_registry activation_op scope device_context executor)
 ENDIF()
diff --git a/tools/dockerfile/Dockerfile.mlu b/tools/dockerfile/Dockerfile.mlu
new file mode 100644
index 0000000000000..f7823738afc53
--- /dev/null
+++ b/tools/dockerfile/Dockerfile.mlu
@@ -0,0 +1,73 @@
+# An image for building paddle binaries
+# Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions
+#
+# Build:
+# - CNTOOLKIT_VERSION 2.6.5-1
+# - CNNL_VERSION 1.8.3-1
+# - CNCL_VERSION 1.0.2-1
+#
+# Download three packages from FTP (contact Cambricon AE to get the FTP URL)
+# - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb
+# - cnnl_1.8.3-1.ubuntu18.04_amd64.deb
+# - cncl_1.0.2-1.ubuntu18.04_amd64.deb
+# Copy them to the current directory first, then run the build commands
+#
+# For example:
+#
+# cd Paddle/tools/dockerfile
+#
+# (get cntoolkit pkg)
+# (get cnnl pkg)
+# (get cncl pkg)
+#
+# docker build -f Dockerfile.mlu  \
+# --build-arg CNTOOLKIT_VERSION=2.6.5-1 \
+# --build-arg CNNL_VERSION=1.8.3-1 \
+# --build-arg CNCL_VERSION=1.0.2-1 \
+# -t paddlepaddle/paddle:latest-dev-mlu .
+#
+# without mlu device:
+# docker run -it --network=host --pids-limit 409600 \
+# paddlepaddle/paddle:latest-dev-mlu /bin/bash
+#
+# with mlu device:
+# docker run -it --network=host --pids-limit 409600 \
+# --device=/dev/cambricon_ctl --device=/dev/cambricon_dev0 \
+# paddlepaddle/paddle:latest-dev-mlu /bin/bash
+
+FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ENV WITH_GPU=OFF
+
+ARG CNTOOLKIT_VERSION=2.6.5-1
+ARG CNNL_VERSION=1.8.3-1
+ARG CNCL_VERSION=1.0.2-1
+ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb
+ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb
+ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb
+
+# install cntoolkit
+COPY $CNTOOLKIT_PKG ./
+RUN dpkg -i $CNTOOLKIT_PKG && \
+    apt-get update && \
+    apt-get install -y cnrt cnperf cnpapi cnlicense cngdb cndrv cndev cncodec cncc cnas cnbin cnstudio cnrtc cnpx && \
+    rm -f $CNTOOLKIT_PKG
+
+ENV NEUWARE_HOME=/usr/local/neuware
+ENV LD_LIBRARY_PATH=$NEUWARE_HOME/lib64:$LD_LIBRARY_PATH
+
+# install cnnl
+COPY $CNNL_PKG ./
+RUN dpkg -i $CNNL_PKG && \
+    rm -f $CNNL_PKG
+
+# install cncl
+COPY $CNCL_PKG ./
+RUN dpkg -i $CNCL_PKG && \
+    rm -f $CNCL_PKG
+
+# Clean
+RUN apt-get clean -y
+
+EXPOSE 22

From b47fb7648c84721808fe7452be96d7a92b98c648 Mon Sep 17 00:00:00 2001
From: TeslaZhao <zhaolisoftware@163.com>
Date: Fri, 21 Jan 2022 15:37:27 +0800
Subject: [PATCH 09/15] Keep strided_slice op behavior consistent with slice op
 when starts input is less than -rank (#39066)

---
 paddle/fluid/operators/strided_slice_op.h              |  6 +-----
 .../fluid/tests/unittests/test_strided_slice_op.py     | 10 ++++++++++
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h
index eaef9496a92dc..47714ebb806e9 100644
--- a/paddle/fluid/operators/strided_slice_op.h
+++ b/paddle/fluid/operators/strided_slice_op.h
@@ -121,6 +121,7 @@ static void StridedSliceFunctor(int64_t* starts, int64_t* ends,
     // stride must not be zero
     if (starts[axis_index] < 0) {
       starts[axis_index] = starts[axis_index] + axis_size;
+      starts[axis_index] = std::max<int64_t>(starts[axis_index], 0);
     }
     if (ends[axis_index] < 0) {
       if (!(ends[axis_index] == -1 &&
@@ -139,11 +140,6 @@ static void StridedSliceFunctor(int64_t* starts, int64_t* ends,
       }
     }
 
-    if ((starts[axis_index] < 0) && (axis_size > 0)) {
-      starts[axis_index] += axis_size;
-      starts[axis_index] = std::max<int64_t>(starts[axis_index], 0);
-    }
-
     if (strides[axis_index] < 0) {
       reverse_axis[axis_index] = 1;
       strides[axis_index] = -strides[axis_index];
diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
index 9d89c7cbe1397..e9be6b338fb86 100644
--- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
@@ -216,6 +216,16 @@ def initTestCase(self):
         self.infer_flags = [1, 1, 1, 1, 1]
 
 
+class TestStrideSliceOp14(TestStrideSliceOp):
+    def initTestCase(self):
+        self.input = np.random.rand(4, 4, 4, 4)
+        self.axes = [1, 2, 3]
+        self.starts = [-5, 0, -7]
+        self.ends = [-1, 2, 4]
+        self.strides = [1, 1, 1]
+        self.infer_flags = [1, 1, 1]
+
+
 class TestStrideSliceOpBool(TestStrideSliceOp):
     def test_check_grad(self):
         pass

From df5152551d933487c7e9f0edd47c7066f2c95f86 Mon Sep 17 00:00:00 2001
From: Zhang Ting <zhangting_2017@163.com>
Date: Fri, 21 Jan 2022 15:38:02 +0800
Subject: [PATCH 10/15] modify DivideFunctor to match ElementwiseSameDims
 template (#39041)

---
 paddle/fluid/operators/mean_op.cu      |  3 +--
 paddle/pten/kernels/gpu/math_kernel.cu | 15 ---------------
 2 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 5a0afb68d63f1..63b5b871aabb5 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -59,8 +59,7 @@ class MeanCUDAKernel : public framework::OpKernel<T> {
       return;
     }
 
-    using MT = typename details::MPTypeTrait<T>::Type;
-    using Div = kernel_primitives::DivideFunctor<T, MT>;
+    using Div = kernel_primitives::DivideFunctor<T, T>;
     std::vector<int> reduce_dims;
     reduce_dims.reserve(rank);
     for (decltype(rank) i = 0; i < rank; ++i) {
diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu
index 80931db56c3de..d7a16ac49b1c9 100644
--- a/paddle/pten/kernels/gpu/math_kernel.cu
+++ b/paddle/pten/kernels/gpu/math_kernel.cu
@@ -52,21 +52,6 @@ namespace pten {
         dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
   }
 
-/**
- * Util Functors
- */
-
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n)
-      : n_inv(static_cast<T>(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
 /**
  * Kernels
  */

From 814e5ab4e837a1c1270f67c6ca491da68b281a11 Mon Sep 17 00:00:00 2001
From: Weilong Wu <veyron_wu@163.com>
Date: Fri, 21 Jan 2022 16:36:50 +0800
Subject: [PATCH 11/15] Renamed selected_rows.* -> selected_rows_utils.*
 (#39037)

---
 paddle/fluid/distributed/service/brpc_utils.h      |  2 +-
 .../distributed/table/depends/large_scale_kv.h     |  2 +-
 paddle/fluid/eager/legacy/tensor_helper.cc         |  2 +-
 paddle/fluid/framework/CMakeLists.txt              | 14 +++++++-------
 paddle/fluid/framework/data_transform.h            |  2 +-
 paddle/fluid/framework/data_type_transform.cc      |  2 +-
 paddle/fluid/framework/details/CMakeLists.txt      |  6 +++---
 .../fluid/framework/details/broadcast_op_handle.h  |  2 +-
 .../framework/details/fused_broadcast_op_handle.h  |  2 +-
 paddle/fluid/framework/details/gather_op_handle.h  |  2 +-
 paddle/fluid/framework/details/reduce_and_gather.h |  2 +-
 paddle/fluid/framework/details/reduce_op_handle.h  |  2 +-
 paddle/fluid/framework/details/variable_visitor.cc |  2 +-
 paddle/fluid/framework/operator.h                  |  2 +-
 paddle/fluid/framework/pten_utils.cc               |  2 +-
 paddle/fluid/framework/pten_utils_test.cc          |  2 +-
 .../{selected_rows.cc => selected_rows_utils.cc}   |  2 +-
 .../{selected_rows.h => selected_rows_utils.h}     |  0
 ...ed_rows_test.cc => selected_rows_utils_test.cc} |  2 +-
 paddle/fluid/framework/var_type.h                  |  2 +-
 paddle/fluid/framework/var_type_traits.cc          |  2 +-
 paddle/fluid/framework/var_type_traits_test.cc     |  2 +-
 paddle/fluid/framework/variable.h                  |  2 +-
 paddle/fluid/framework/variable_helper.cc          |  2 +-
 paddle/fluid/imperative/CMakeLists.txt             | 10 +++++-----
 paddle/fluid/imperative/all_reduce.cc              |  2 +-
 paddle/fluid/imperative/gloo_context.h             |  2 +-
 paddle/fluid/imperative/gradient_accumulator.cc    |  2 +-
 paddle/fluid/imperative/tests/CMakeLists.txt       |  2 +-
 paddle/fluid/operators/CMakeLists.txt              |  2 +-
 paddle/fluid/operators/clip_by_norm_op.h           |  2 +-
 .../operators/fused/fused_embedding_seq_pool_op.h  |  2 +-
 paddle/fluid/operators/lookup_table_dequant_op.h   |  2 +-
 paddle/fluid/operators/lookup_table_op.h           |  2 +-
 paddle/fluid/operators/lookup_table_v2_op.h        |  2 +-
 paddle/fluid/operators/math/CMakeLists.txt         |  4 ++--
 paddle/fluid/operators/math/matrix_bit_code.h      |  2 +-
 .../fluid/operators/math/selected_rows_functor.h   |  2 +-
 paddle/fluid/operators/nce_op.h                    |  2 +-
 paddle/fluid/operators/optimizers/sgd_op.h         |  2 +-
 paddle/fluid/operators/save_op.h                   |  2 +-
 paddle/fluid/pybind/io.cc                          |  2 +-
 paddle/fluid/pybind/pybind.cc                      |  2 +-
 paddle/pten/api/lib/utils/CMakeLists.txt           |  2 +-
 44 files changed, 56 insertions(+), 56 deletions(-)
 rename paddle/fluid/framework/{selected_rows.cc => selected_rows_utils.cc} (99%)
 rename paddle/fluid/framework/{selected_rows.h => selected_rows_utils.h} (100%)
 rename paddle/fluid/framework/{selected_rows_test.cc => selected_rows_utils_test.cc} (99%)

diff --git a/paddle/fluid/distributed/service/brpc_utils.h b/paddle/fluid/distributed/service/brpc_utils.h
index 47de71d2087e9..556bbb1048e2c 100644
--- a/paddle/fluid/distributed/service/brpc_utils.h
+++ b/paddle/fluid/distributed/service/brpc_utils.h
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/port.h"
diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h
index ac11183d192ff..3b00f1d6ccc3a 100644
--- a/paddle/fluid/distributed/table/depends/large_scale_kv.h
+++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h
@@ -33,7 +33,7 @@
 #include "paddle/fluid/framework/generator.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/rw_lock.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/framework/variable.h"
diff --git a/paddle/fluid/eager/legacy/tensor_helper.cc b/paddle/fluid/eager/legacy/tensor_helper.cc
index 97cac5a340419..2ee2f9fefa9a3 100644
--- a/paddle/fluid/eager/legacy/tensor_helper.cc
+++ b/paddle/fluid/eager/legacy/tensor_helper.cc
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 83e5c1c17925e..e4fe35b9b5c5a 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -110,7 +110,7 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader)
 cc_library(threadpool SRCS threadpool.cc DEPS enforce)
 cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
 
-cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto scope)
+cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows_utils framework_proto scope)
 if (WITH_GPU)
   target_link_libraries(var_type_traits dynload_cuda)
 endif()
@@ -164,7 +164,7 @@ cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_
 cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
 
 cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
-        framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
+        framework_proto selected_rows_utils data_device_transform data_type_transform data_layout_transform)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
@@ -382,8 +382,8 @@ cc_library(prune SRCS prune.cc DEPS framework_proto boost)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
 cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
         proto_desc)
-cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
-cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS tensor)
+cc_test(selected_rows_utils_test SRCS selected_rows_utils_test.cc DEPS selected_rows_utils)
 
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
@@ -406,7 +406,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
-cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows place pten var_type_traits pten_api_utils op_info)
+cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info)
 
 # Get the current working branch
 execute_process(
@@ -438,8 +438,8 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 
-if(WITH_TESTING AND TEST selected_rows_test)
-  set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120)
+if(WITH_TESTING AND TEST selected_rows_utils_test)
+  set_tests_properties(selected_rows_utils_test PROPERTIES TIMEOUT 120)
 endif()
 
 cc_test(scope_guard_test SRCS scope_guard_test.cc)
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
index f8b36b48c308e..385a5ff704f51 100644
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/op_kernel_type.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index faff846cf2a60..5b6aedd2fe14b 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/data_type_transform.h"
 
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 87f77ec2fff3a..66dfb81755f1c 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -12,7 +12,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place
 cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry)
 cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper)
 
-cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
+cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows_utils)
 
 if(WITH_PSCORE)
     set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
@@ -88,7 +88,7 @@ endif()
 
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 
-cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
+cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows_utils reference_count_pass_helper)
 
 set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
     multi_devices_helper
@@ -114,7 +114,7 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha
 cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
         device_context gather_op_handle)
 
-cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows)
+cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows_utils)
 cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor)
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h
index 055c7e63863b3..8453da3c79066 100644
--- a/paddle/fluid/framework/details/broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/broadcast_op_handle.h
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
index e08a768f8ce07..6ba6df7011ade 100644
--- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h
+++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -22,7 +22,7 @@
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h
index 9cbd94cd6b877..575b7ca083d94 100644
--- a/paddle/fluid/framework/details/gather_op_handle.h
+++ b/paddle/fluid/framework/details/gather_op_handle.h
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h
index 9ecb2d8dbdd1c..583c34494bca4 100644
--- a/paddle/fluid/framework/details/reduce_and_gather.h
+++ b/paddle/fluid/framework/details/reduce_and_gather.h
@@ -19,7 +19,7 @@
 
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 namespace paddle {
 namespace framework {
 namespace details {
diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h
index 4b9f289eaa787..e9c913b0c8255 100644
--- a/paddle/fluid/framework/details/reduce_op_handle.h
+++ b/paddle/fluid/framework/details/reduce_op_handle.h
@@ -22,7 +22,7 @@
 #include "paddle/fluid/framework/details/op_handle_base.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc
index be1371542f530..4315b6b0fc245 100644
--- a/paddle/fluid/framework/details/variable_visitor.cc
+++ b/paddle/fluid/framework/details/variable_visitor.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/framework/details/variable_visitor.h"
 
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 
 namespace pten {
 class DenseTensor;
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index ad84dbc9be6d2..8e000ef9985bd 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/memory/malloc.h"
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index 4e33e641cf1fc..2fd5b87b7f3fd 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/string/string_helper.h"
 
diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc
index ab2d60a34303a..004345fa1e571 100644
--- a/paddle/fluid/framework/pten_utils_test.cc
+++ b/paddle/fluid/framework/pten_utils_test.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/pten_utils.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/variable.h"
 
 TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) {
diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows_utils.cc
similarity index 99%
rename from paddle/fluid/framework/selected_rows.cc
rename to paddle/fluid/framework/selected_rows_utils.cc
index 6cad0915be736..c33ee655c2a98 100644
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows_utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows_utils.h
similarity index 100%
rename from paddle/fluid/framework/selected_rows.h
rename to paddle/fluid/framework/selected_rows_utils.h
diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc
similarity index 99%
rename from paddle/fluid/framework/selected_rows_test.cc
rename to paddle/fluid/framework/selected_rows_utils_test.cc
index 3b0509e0344ef..7a9f86041d996 100644
--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_utils_test.cc
@@ -13,7 +13,7 @@ limitations under the License. */
 #include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
index 2e35f9b845ac7..5747df57c4568 100644
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/framework/variable.h"
 
diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc
index eb8a1e4cea9fb..401ccb03d78d6 100644
--- a/paddle/fluid/framework/var_type_traits.cc
+++ b/paddle/fluid/framework/var_type_traits.cc
@@ -16,7 +16,7 @@
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #include "paddle/fluid/platform/macros.h"
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc
index 9a9b90cd81179..812a34112a465 100644
--- a/paddle/fluid/framework/var_type_traits_test.cc
+++ b/paddle/fluid/framework/var_type_traits_test.cc
@@ -17,7 +17,7 @@
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h
index 2fa48150903ad..188b00d818de3 100644
--- a/paddle/fluid/framework/variable.h
+++ b/paddle/fluid/framework/variable.h
@@ -18,7 +18,7 @@
 #include <typeindex>
 #include <typeinfo>
 
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc
index 37ec5d7bc83bd..34ab07def54c1 100644
--- a/paddle/fluid/framework/variable_helper.cc
+++ b/paddle/fluid/framework/variable_helper.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/string_array.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 594b0d48a8aad..d0f8d39f927f6 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,8 +1,8 @@
 cc_library(imperative_flag SRCS flags.cc DEPS gflags flags)
 IF(WITH_XPU)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
 ELSE()
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils)
 ENDIF()
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
@@ -13,7 +13,7 @@ cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradien
 cc_library(imperative_profiler SRCS profiler.cc DEPS flags)
 if(NOT WIN32)
     if(WITH_NCCL OR WITH_RCCL)
-        cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor)
+        cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows_utils tensor)
         cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits)
         if(WITH_NCCL)
             nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce)
@@ -43,9 +43,9 @@ if(WITH_GLOO)
 endif()
 
 if(NOT WITH_ASCEND_CL)
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function)
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function)
 else()
-cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function npu_op_runner)
+cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner)
 endif()
 
 add_subdirectory(tests)
diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc
index 78855cc5c9e2e..d1d6a0f5adf58 100644
--- a/paddle/fluid/imperative/all_reduce.cc
+++ b/paddle/fluid/imperative/all_reduce.cc
@@ -25,7 +25,7 @@
 #endif
 
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/parallel_context.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h
index e7c9ba4cfddb6..f13bb859eee93 100644
--- a/paddle/fluid/imperative/gloo_context.h
+++ b/paddle/fluid/imperative/gloo_context.h
@@ -17,7 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/parallel_context.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index 2056b8622052b..092872247cca5 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -19,7 +19,7 @@
 #include <utility>
 
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt
index 32e982f1f15ca..88f8076885e2f 100644
--- a/paddle/fluid/imperative/tests/CMakeLists.txt
+++ b/paddle/fluid/imperative/tests/CMakeLists.txt
@@ -12,7 +12,7 @@ else()
 endif(WIN32)
 
 
-cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows selected_rows_functor gradient_accumulator math_function)
+cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function)
 cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy)
 cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place)
 cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 985f870ded4e7..f8a27da00ba2b 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -156,7 +156,7 @@ endif()
 
 cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lapack_function
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function
 lod_tensor maxouting unpooling pooling lod_rank_table context_project
 sequence_pooling segment_pooling executor device_memory_aligment generator)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h
index fb21d9fec90ca..adb2a2fcfa3a7 100644
--- a/paddle/fluid/operators/clip_by_norm_op.h
+++ b/paddle/fluid/operators/clip_by_norm_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/transform.h"
 
diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
index 8713d58034241..4e4322947a857 100644
--- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
+++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/operators/math/blas.h"
 
diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h
index af99c6e98c5ad..70aad1d3238f2 100644
--- a/paddle/fluid/operators/lookup_table_dequant_op.h
+++ b/paddle/fluid/operators/lookup_table_dequant_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/math/blas.h"
 
diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h
index 74e26626bd528..a89d5fb7cb6e5 100644
--- a/paddle/fluid/operators/lookup_table_op.h
+++ b/paddle/fluid/operators/lookup_table_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h
index 4e8d96afa03c4..54564395c6d04 100644
--- a/paddle/fluid/operators/lookup_table_v2_op.h
+++ b/paddle/fluid/operators/lookup_table_v2_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index fcf988efcd34c..65bf595bcebb8 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -66,9 +66,9 @@ math_library(maxouting)
 math_library(pooling)
 
 if(WITH_MKLDNN)
-    math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler)
+    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler)
 else()
-    math_library(selected_rows_functor DEPS selected_rows math_function blas)
+    math_library(selected_rows_functor DEPS selected_rows_utils math_function blas)
 endif()
 
 math_library(sequence2batch)
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 22e5256335c73..71d905214ab9f 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index a1eb69db7cfce..8ba7851d7b979 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h
index 364a0f02e3ab7..55f684b66485b 100644
--- a/paddle/fluid/operators/nce_op.h
+++ b/paddle/fluid/operators/nce_op.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/operators/math/sampler.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index a1fb3debb48e6..9d98e745a01ae 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/jit/kernels.h"
 #include "paddle/fluid/platform/bfloat16.h"
diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h
index e44a5c77bd841..5ed71a26c8aa3 100644
--- a/paddle/fluid/operators/save_op.h
+++ b/paddle/fluid/operators/save_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/variable.h"
 
 namespace paddle {
diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc
index fc49f76305461..88a43f9428b22 100644
--- a/paddle/fluid/pybind/io.cc
+++ b/paddle/fluid/pybind/io.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/pybind/io.h"
 #include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index cd999f17f3a2f..cdbfa11abec72 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -54,7 +54,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/save_load_util.h"
 #include "paddle/fluid/framework/scope_pool.h"
-#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/trainer.h"
 #include "paddle/fluid/framework/type_defs.h"
diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt
index a4db8c4b193b6..74ecb3cd65262 100644
--- a/paddle/pten/api/lib/utils/CMakeLists.txt
+++ b/paddle/pten/api/lib/utils/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS
-tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits)
+tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits)

From 06803c29a387e52756ddeffb8e97e25062e237aa Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Fri, 21 Jan 2022 16:37:55 +0800
Subject: [PATCH 12/15] [pten] add concat pten kernel (#38955)

---
 paddle/fluid/framework/CMakeLists.txt         |   2 +-
 paddle/fluid/framework/lod_tensor.cc          |  42 +-
 paddle/fluid/framework/lod_tensor.h           |  14 -
 paddle/fluid/framework/lod_tensor_test.cc     |   5 +-
 paddle/fluid/framework/operator.cc            |   4 +
 paddle/fluid/imperative/prepared_operator.cc  |   4 +
 .../fluid/operators/array_to_lod_tensor_op.cc |   3 +-
 paddle/fluid/operators/concat_op.cc           |  15 +-
 paddle/fluid/operators/concat_op.h            | 111 +---
 paddle/fluid/operators/concat_op_xpu.cc       |   6 +-
 .../fluid/operators/lod_tensor_to_array_op.cc |   3 +-
 .../fluid/operators/math/concat_and_split.cc  |  81 +--
 .../fluid/operators/math/concat_and_split.cu  | 435 +------------
 paddle/fluid/operators/merge_lod_tensor_op.cc |   4 +-
 .../fluid/operators/shrink_rnn_memory_op.cc   |   4 +-
 paddle/fluid/operators/split_lod_tensor_op.cc |   3 +-
 paddle/fluid/pybind/pybind.cc                 |   4 +-
 paddle/pten/CMakeLists.txt                    |   2 +-
 paddle/pten/api/include/kernel_signature.h    |   5 +
 paddle/pten/api/lib/utils/tensor_utils.cc     |   9 +-
 paddle/pten/core/CMakeLists.txt               |   1 +
 paddle/pten/core/kernel_context.h             |   2 +-
 paddle/pten/core/lod_utils.cc                 |  59 ++
 paddle/pten/core/lod_utils.h                  |  37 ++
 paddle/pten/infermeta/multiary.cc             |  41 +-
 paddle/pten/infermeta/multiary.h              |  11 +-
 paddle/pten/kernels/CMakeLists.txt            |   2 +-
 paddle/pten/kernels/concat_kernel.h           |  43 ++
 paddle/pten/kernels/cpu/concat_and_split.h    | 138 +++++
 paddle/pten/kernels/cpu/concat_kernel.cc      | 125 ++++
 paddle/pten/kernels/funcs/concat_funcs.h      |  95 +++
 paddle/pten/kernels/gpu/concat_and_split.h    | 569 ++++++++++++++++++
 paddle/pten/kernels/gpu/concat_kernel.cu      | 125 ++++
 paddle/pten/tests/api/CMakeLists.txt          |   1 +
 paddle/pten/tests/api/test_concat_api.cc      |  86 +++
 paddle/pten/tests/kernels/CMakeLists.txt      |   1 +
 .../pten/tests/kernels/test_concat_dev_api.cc |  82 +++
 python/paddle/utils/code_gen/api.yaml         |  10 +
 python/paddle/utils/code_gen/api_gen.py       |  39 +-
 39 files changed, 1552 insertions(+), 671 deletions(-)
 create mode 100644 paddle/pten/core/lod_utils.cc
 create mode 100644 paddle/pten/core/lod_utils.h
 create mode 100644 paddle/pten/kernels/concat_kernel.h
 create mode 100644 paddle/pten/kernels/cpu/concat_and_split.h
 create mode 100644 paddle/pten/kernels/cpu/concat_kernel.cc
 create mode 100644 paddle/pten/kernels/funcs/concat_funcs.h
 create mode 100644 paddle/pten/kernels/gpu/concat_and_split.h
 create mode 100644 paddle/pten/kernels/gpu/concat_kernel.cu
 create mode 100644 paddle/pten/tests/api/test_concat_api.cc
 create mode 100644 paddle/pten/tests/kernels/test_concat_dev_api.cc

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index e4fe35b9b5c5a..286a8684127a9 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -94,7 +94,7 @@ else()
 endif()
 cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version)
 
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_utils lod_tensor memory)
 
 if(WITH_GPU)
   nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index a4b9fff8ecd15..ab2e30a15ea15 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -117,7 +117,8 @@ bool CheckLoD(const LoD &in, int tensor_height) {
   }
   // check: the lowest level's last offset should equals `tensor_height` if
   //        tensor_height>0.
-  if (tensor_height > 0 && (size_t)tensor_height != in.back().back())
+  if (tensor_height > 0 &&
+      static_cast<size_t>(tensor_height) != in.back().back())
     return false;
 
   // check: the higher level's last offset should equals the lower level's
@@ -150,7 +151,7 @@ bool CheckAbsLoD(const LoD &in, int tensor_height) {
     if (level.front() != 0) return false;
     if (tensor_height < 0) {
       tensor_height = level.back();
-    } else if ((size_t)tensor_height != level.back()) {
+    } else if (static_cast<size_t>(tensor_height) != level.back()) {
       return false;
     }
   }
@@ -186,27 +187,6 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
   return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }
 
-void AppendLoD(LoD *lod, const LoD &lod_length) {
-  PADDLE_ENFORCE(
-      lod->empty() || lod->size() == lod_length.size(),
-      platform::errors::InvalidArgument(
-          "The input LoD length should be equal to the appended LoD size, but "
-          "received input LoD length is %d, actual LoD size is %d.",
-          lod_length, lod->size()));
-  if (lod->empty()) {
-    for (size_t i = 0; i < lod_length.size(); ++i) {
-      lod->emplace_back(1, 0);  // size = 1, value = 0;
-    }
-    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
-  }
-  for (size_t i = 0; i < lod->size(); ++i) {
-    auto &level = (*lod)[i];
-    for (size_t len : lod_length[i]) {
-      level.push_back(level.back() + len);
-    }
-  }
-}
-
 void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
                        const platform::DeviceContext &dev_ctx) {
   {  // the 1st field, uint32_t version for LoDTensor
@@ -313,22 +293,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
-  LoD length_lod;
-  length_lod.reserve(offset_lod.size());
-  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
-    std::vector<size_t> level;
-    if (offset_lod[lvl].size() > 0) {
-      level.reserve(offset_lod[lvl].size() - 1);
-    }
-    for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) {
-      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
-    }
-    length_lod.push_back(level);
-  }
-  return length_lod;
-}
-
 LoD ConvertToOffsetBasedLoD(const LoD &length_lod) {
   LoD offset_lod;
   offset_lod.reserve(length_lod.size());
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 14727c190b581..63680c008bf66 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -157,8 +157,6 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
 std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
     const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
 
-void AppendLoD(LoD* lod, const LoD& lod_length);
-
 /*
  * Serialize/Desiralize LoDTensor to std::ostream
  * You can pass ofstream or ostringstream to serilize to file
@@ -173,18 +171,6 @@ void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
                            const size_t& seek,
                            const std::vector<int64_t>& shape);
 
-/*
- * Convert between length-based LoD and offset-based LoD.
- * The implementation of LoDTensor class use offset-based LoD.
- * However, we want to expose the more user-friendly length-based
- * LoD to the Python side instead.
- *
- * Example:
- * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
- * then length_lod = [[2, 1], [3, 2, 4]]
- */
-LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
-
 LoD ConvertToOffsetBasedLoD(const LoD& length_lod);
 
 void SerializeToStream(std::ostream& os, const LoDTensor& tensor);
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index 917bb7cc096c2..5e72c2d3d7e94 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -16,6 +16,7 @@
 #include <gtest/gtest.h>
 
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/pten/core/lod_utils.h"
 
 namespace paddle {
 namespace framework {
@@ -98,7 +99,7 @@ TEST(LoD, AppendLoD) {
   origin.push_back(std::vector<size_t>({0, 1, 6}));
   origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
 
-  paddle::framework::AppendLoD(&origin, lod_lens);
+  pten::AppendLoD(&origin, lod_lens);
 
   LoD expected;
   expected.push_back(std::vector<size_t>({0, 2, 4}));
@@ -277,7 +278,7 @@ TEST(LoD, ConvertToLengthBasedLoD) {
   offset_lod.push_back(std::vector<size_t>({0, 1, 3}));
   offset_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
 
-  LoD length_lod = ConvertToLengthBasedLoD(offset_lod);
+  LoD length_lod = pten::ConvertToLengthBasedLoD(offset_lod);
 
   LoD expected;
   expected.push_back(std::vector<size_t>({2}));
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index e69a6c2e88c6b..33a4e5d2f3906 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1978,6 +1978,10 @@ void OperatorWithKernel::BuildPtenKernelContext(
                    std::type_index(typeid(std::string))) {
           pt_kernel_context->EmplaceBackAttr(
               std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
+        } else if (std::type_index(attr.type()) ==
+                   std::type_index(typeid(int))) {
+          pt_kernel_context->EmplaceBackAttr(
+              std::move(pten::Scalar(BOOST_GET_CONST(int, attr))));
         } else {
           PADDLE_THROW(platform::errors::Unimplemented(
               "Unsupported cast op attribute `%s` to Scalar when construct "
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index d28595a6a4c75..fe60f05e1da43 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -438,6 +438,10 @@ static void BuildDygraphPtenKernelContext(
                    std::type_index(typeid(std::string))) {
           kernel_ctx->EmplaceBackAttr(
               std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
+        } else if (std::type_index(attr.type()) ==
+                   std::type_index(typeid(int))) {
+          kernel_ctx->EmplaceBackAttr(
+              std::move(pten::Scalar(BOOST_GET_CONST(int, attr))));
         } else {
           PADDLE_THROW(platform::errors::Unimplemented(
               "Unsupported cast op attribute `%s` to Scalar when construct "
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 1680ad528abf9..a959067ddba62 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include <paddle/fluid/operators/math/concat_and_split.h>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/core/lod_utils.h"
 
 namespace paddle {
 namespace framework {
@@ -168,7 +169,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
             x[x_idx].lod(), idx, idx + 1, 0);
 
         auto &lod_length = lod_and_offset.first;
-        framework::AppendLoD(out_lod, lod_length);
+        pten::AppendLoD(out_lod, lod_length);
 
         size_t start_offset = lod_and_offset.second.first;
         size_t end_offset = lod_and_offset.second.second;
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index e6b1f6a1c18c3..9eba127a9b3ce 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -19,6 +19,8 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include "paddle/pten/kernels/funcs/concat_funcs.h"
+
 #ifdef PADDLE_WITH_MKLDNN
 #include <paddle/fluid/platform/mkldnn_helper.h>
 #endif
@@ -56,8 +58,8 @@ class ConcatOp : public framework::OperatorWithKernel {
       size_t axis =
           ComputeAxis(static_cast<int64_t>(ctx->Attrs().Get<int>("axis")),
                       static_cast<int64_t>(inputs_dims[0].size()));
-      framework::DDim out_dims =
-          ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis);
+      framework::DDim out_dims = pten::funcs::ComputeAndCheckShape(
+          ctx->IsRuntime(), inputs_dims, axis);
       if (out_dims[axis] < 0) {
         out_dims[axis] = -1;
       }
@@ -102,6 +104,15 @@ class ConcatOp : public framework::OperatorWithKernel {
     return framework::OpKernelType(expected_kernel_type.data_type_,
                                    tensor.place(), tensor.layout());
   }
+
+  framework::KernelSignature GetExpectedPtenKernelArgs(
+      const framework::ExecutionContext &ctx) const override {
+    if (ctx.HasInput("AxisTensor")) {
+      return framework::KernelSignature("concat", {"X"}, {"AxisTensor"},
+                                        {"Out"});
+    }
+    return framework::KernelSignature("concat", {"X"}, {"axis"}, {"Out"});
+  }
 };
 
 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index bb72174be5ed5..3eaffbdc8bf35 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -22,54 +22,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
 
+#include "paddle/pten/kernels/concat_kernel.h"
+#include "paddle/pten/kernels/funcs/concat_funcs.h"
+
 namespace paddle {
 namespace operators {
-static inline framework::DDim ComputeAndCheckShape(
-    const bool is_runtime, const std::vector<framework::DDim>& inputs_dims,
-    const size_t axis) {
-  const size_t n = inputs_dims.size();
-  auto out_dims = inputs_dims[0];
-  size_t in_zero_dims_size = out_dims.size();
-  for (size_t i = 1; i < n; i++) {
-    PADDLE_ENFORCE_EQ(inputs_dims[i].size(), out_dims.size(),
-                      platform::errors::InvalidArgument(
-                          "The shape of input[0] and input[%d] "
-                          "is expected to be equal."
-                          "But received input[0]'s shape = "
-                          "[%s], input[%d]'s shape = [%s].",
-                          i, inputs_dims[0], i, inputs_dims[i]));
-    for (size_t j = 0; j < in_zero_dims_size; j++) {
-      if (j == axis) {
-        if (is_runtime) {
-          out_dims[axis] += inputs_dims[i][j];
-        } else {
-          if (inputs_dims[i][j] == -1 || out_dims[j] == -1) {
-            out_dims[axis] = -1;
-          } else {
-            out_dims[axis] += inputs_dims[i][j];
-          }
-        }
-      } else {
-        bool check_shape =
-            is_runtime || (inputs_dims[0][j] > 0 && inputs_dims[i][j] > 0);
-        if (check_shape) {
-          // check all shape in run time
-          PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j],
-                            platform::errors::InvalidArgument(
-                                "The %d-th dimension of input[0] and input[%d] "
-                                "is expected to be equal."
-                                "But received input[0]'s shape = "
-                                "[%s], input[%d]'s shape = [%s].",
-                                j, i, inputs_dims[0], i, inputs_dims[i]));
-        }
-        if (!is_runtime && out_dims[j] == -1 && inputs_dims[i][j] > 0) {
-          out_dims[j] = inputs_dims[i][j];
-        }
-      }
-    }
-  }
-  return out_dims;
-}
 
 static inline int64_t ComputeAxis(int64_t axis, int64_t rank) {
   PADDLE_ENFORCE_EQ(
@@ -109,67 +66,21 @@ class ConcatKernel : public framework::OpKernel<T> {
         ins_dims[i] = ins[i]->dims();
       }
 
-      framework::DDim out_dims = ComputeAndCheckShape(true, ins_dims, axis);
+      framework::DDim out_dims =
+          pten::funcs::ComputeAndCheckShape(true, ins_dims, axis);
       out->Resize(out_dims);
     }
     auto place = ctx.GetPlace();
     out->mutable_data<T>(place);
 
-    // If axis is 0, the lod of the output is not the same as inputs.
-    if (axis == 0 && ins[0]->lod().size() > 0) {
-      size_t lod_size_0 = ins[0]->lod().size();
-      size_t lod_size = lod_size_0;
-      for (size_t i = 1; i < ins.size(); ++i) {
-        if (ins[i]->lod().size() > 0) {
-          PADDLE_ENFORCE_EQ(
-              ins[i]->lod().size(), lod_size_0,
-              platform::errors::Unimplemented(
-                  "The lod level of all input LoDTensors should be same. "
-                  "Maybe different lod level of input LoDTensors can concat,"
-                  "it is not supported currently. The lod level of %dth input "
-                  "is %d and first input is %d.",
-                  i, ins[i]->lod().size(), lod_size_0));
-        } else {
-          lod_size = 0;
-          break;
-        }
-      }
-      if (lod_size) {
-        auto* out_lod = out->mutable_lod();
-        for (size_t i = 1; i < ins.size(); ++i) {
-          auto in_lod = ConvertToLengthBasedLoD(ins[i]->lod());
-          AppendLoD(out_lod, in_lod);
-        }
-      }
+    // call new kernel
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    std::vector<pten::DenseTensor> pt_ins;
+    for (auto& in : ins) {
+      pt_ins.push_back(*in);
     }
 
-    // Sometimes direct copies will be faster, this maybe need deeply analysis.
-    if (axis == 0 && ins.size() < 10) {
-      size_t output_offset = 0;
-      for (auto* in : ins) {
-        if (!in || in->numel() == 0UL) {
-          continue;
-        }
-        auto in_stride = framework::stride_numel(in->dims());
-        auto out_stride = framework::stride_numel(out->dims());
-        StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
-                                    out->data<T>() + output_offset, out_stride,
-                                    in->data<T>(), in_stride, in_stride[axis]);
-        output_offset += in_stride[axis];
-      }
-    } else {
-      std::vector<framework::Tensor> inputs;
-      for (size_t j = 0; j < ins.size(); ++j) {
-        if (ins[j] && ins[j]->numel() > 0) {
-          inputs.push_back(*ins[j]);
-        } else {
-          continue;
-        }
-      }
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
-      paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
-      concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
-    }
+    pten::ConcatKernel<T>(dev_ctx, pt_ins, axis, out);
   }
 };
 
diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc
index 0ff11e11165f0..aa10a58738bbd 100644
--- a/paddle/fluid/operators/concat_op_xpu.cc
+++ b/paddle/fluid/operators/concat_op_xpu.cc
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
+#include "paddle/pten/core/lod_utils.h"
+
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
@@ -69,8 +71,8 @@ class ConcatXPUKernel : public framework::OpKernel<T> {
       if (lod_size) {
         auto* out_lod = out->mutable_lod();
         for (size_t i = 1; i < ins.size(); ++i) {
-          auto in_lod = ConvertToLengthBasedLoD(ins[i]->lod());
-          AppendLoD(out_lod, in_lod);
+          auto in_lod = pten::ConvertToLengthBasedLoD(ins[i]->lod());
+          pten::AppendLoD(out_lod, in_lod);
         }
       }
     }
diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc
index e02972bd75353..5f39a9afa94ba 100644
--- a/paddle/fluid/operators/lod_tensor_to_array_op.cc
+++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/core/lod_utils.h"
 
 namespace paddle {
 namespace framework {
@@ -134,7 +135,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
         auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
             x.lod(), start_idx, start_idx + 1, rank_level + 1);
         auto &lod_length = lod_and_offset.first;
-        framework::AppendLoD(&lod, lod_length);
+        pten::AppendLoD(&lod, lod_length);
         size_t start_offset = lod_and_offset.second.first;
         size_t end_offset = lod_and_offset.second.second;
         copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc
index 4f12630d1e02f..a9f2680660bd2 100644
--- a/paddle/fluid/operators/math/concat_and_split.cc
+++ b/paddle/fluid/operators/math/concat_and_split.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/concat_and_split.h"
+
+#include "paddle/pten/kernels/cpu/concat_and_split.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #endif
@@ -44,36 +46,9 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const std::vector<framework::Tensor>& input, int axis,
                   framework::Tensor* output) {
-    // TODO(zcd): Add input data validity checking
-    size_t num = input.size();
-
-    int64_t rows = 1;
-    auto dim_0 = input[0].dims();
-    for (int i = 0; i < axis; ++i) {
-      rows *= dim_0[i];
-    }
-    int64_t out_rows = rows, out_cols = 0;
-
-    std::vector<int64_t> input_cols(input.size());
-    for (size_t i = 0; i < num; ++i) {
-      int64_t t_cols = input[i].numel() / rows;
-      out_cols += t_cols;
-      input_cols[i] = t_cols;
-    }
-    auto cpu_place = context.GetPlace();
-
-    // computation
-    auto output_data = output->data<T>();
-    int64_t col_idx = 0;
-    for (size_t j = 0; j < num; ++j) {
-      int64_t col_len = input_cols[j];
-      auto input_data = input[j].data<T>();
-      for (int64_t k = 0; k < out_rows; ++k) {
-        memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place,
-                     input_data + k * col_len, sizeof(T) * col_len);
-      }
-      col_idx += col_len;
-    }
+    std::vector<pten::DenseTensor> pt_input{input.begin(), input.end()};
+    pten::ConcatImpl<T, platform::CPUDeviceContext>(context, pt_input, axis,
+                                                    output);
   }
 };
 
@@ -88,46 +63,12 @@ class SplitFunctor<platform::CPUDeviceContext, T> {
                   const framework::Tensor& input,
                   const std::vector<const framework::Tensor*>& ref_inputs,
                   const int axis, std::vector<framework::Tensor*>* outputs) {
-    // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
-    // tensors of shape [0,1,4]
-    if (input.numel() == 0) {
-      return;
-    }
-
-    // TODO(zcd): Add input data validity checking
-    size_t num = outputs->size();
-
-    int input_rows = 1;
-    auto dim_0 = ref_inputs[0]->dims();
-    for (int i = 0; i < axis; ++i) {
-      input_rows *= dim_0[i];
-    }
-
-    int input_cols = 0;
-
-    std::vector<int64_t> output_cols(outputs->size());
-    for (size_t i = 0; i < num; ++i) {
-      int t_cols = ref_inputs[i]->numel() / input_rows;
-      input_cols += t_cols;
-      output_cols[i] = t_cols;
-    }
-    auto cpu_place = context.GetPlace();
-
-    // computation
-    for (int k = 0; k < input_rows; ++k) {
-      const T* src_ptr = input.data<T>() + k * input_cols;
-      int col_idx = 0;
-      for (size_t j = 0; j < num; ++j) {
-        int col_len = output_cols[j];
-        auto* out_tensor = outputs->at(j);
-        if (out_tensor != nullptr) {
-          T* dst_ptr = out_tensor->data<T>() + k * col_len;
-          memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
-                       sizeof(T) * col_len);
-        }
-        col_idx += col_len;
-      }
-    }
+    std::vector<const pten::DenseTensor*> pt_ref_inputs{ref_inputs.begin(),
+                                                        ref_inputs.end()};
+    std::vector<pten::DenseTensor*> pt_outputs{outputs->begin(),
+                                               outputs->end()};
+    pten::SplitImpl<T, platform::CPUDeviceContext>(
+        context, input, pt_ref_inputs, axis, &pt_outputs);
   }
 };
 
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index 5b99a62d78d2a..4357a86b7e65d 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -12,218 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
-#include <vector>
-#include "gflags/gflags.h"
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
-#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
-#include "paddle/fluid/platform/float16.h"
 
+#include "paddle/pten/kernels/gpu/concat_and_split.h"
 namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T>
-__global__ void ConcatKernel(const T** inputs, const int64_t* input_cols,
-                             int col_size, const int64_t output_rows,
-                             const int64_t output_cols, T* output) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int curr_segment = 0;
-  int curr_offset = input_cols[0];
-  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
-    int curr_col_offset = input_cols[curr_segment + 1];
-    while (curr_col_offset <= tid_x) {
-      curr_offset = curr_col_offset;
-      ++curr_segment;
-      curr_col_offset = input_cols[curr_segment + 1];
-    }
-
-    int local_col = tid_x - curr_offset;
-    int segment_width = curr_col_offset - curr_offset;
-
-    const T* input_ptr = inputs[curr_segment];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
-      output[tid_y * output_cols + tid_x] =
-          input_ptr[tid_y * segment_width + local_col];
-  }
-}
-
-template <typename T>
-__device__ void ConcatKernelDetail(const T** inputs_data,
-                                   const int fixed_in_col, const int out_rows,
-                                   const int out_cols, T* output_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x * 1.0 / fixed_in_col;
-    int in_offset = tid_x - split * fixed_in_col;
-    const T* input_ptr = inputs_data[split];
-    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
-      output_data[tid_y * out_cols + tid_x] =
-          input_ptr[tid_y * fixed_in_col + in_offset];
-    }
-  }
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
-                             const int64_t fixed_in_col, const int64_t out_rows,
-                             const int64_t out_cols, T* output_data) {
-  const T* inputs_data[2];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
-                             const T* input_addr2, const int64_t fixed_in_col,
-                             const int64_t out_rows, const int64_t out_cols,
-                             T* output_data) {
-  const T* inputs_data[3];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  inputs_data[2] = input_addr2;
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
-                             const T* input_addr2, const T* input_addr3,
-                             const int64_t fixed_in_col, const int64_t out_rows,
-                             const int64_t out_cols, T* output_data) {
-  const T* inputs_data[4];
-  inputs_data[0] = input_addr0;
-  inputs_data[1] = input_addr1;
-  inputs_data[2] = input_addr2;
-  inputs_data[3] = input_addr3;
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void ConcatKernel(const T** inputs_data, const int in_num,
-                             const int64_t fixed_in_col, const int64_t out_rows,
-                             const int64_t out_cols, T* output_data) {
-  ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
-                        output_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int64_t in_row,
-                            const int64_t in_col, const int64_t* out_cols,
-                            int out_cols_size, T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int curr_segment = 0;
-  int curr_offset = out_cols[0];
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int curr_col_offset = out_cols[curr_segment + 1];
-    while (curr_col_offset <= tid_x) {
-      curr_offset = curr_col_offset;
-      ++curr_segment;
-      curr_col_offset = out_cols[curr_segment + 1];
-    }
-
-    int local_col = tid_x - curr_offset;
-    int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs_data[curr_segment];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * segment_width + local_col] =
-            input_data[tid_y * in_col + tid_x];
-    }
-  }
-}
-
-template <typename T>
-__device__ void SplitKernelDetail(const T* input_data, const int in_row,
-                                  const int in_col, const int fixed_out_col,
-                                  T** outputs_data) {
-  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
-    int split = tid_x / fixed_out_col;
-    int in_offset = tid_x - split * fixed_out_col;
-    T* output_ptr = outputs_data[split];
-    if (output_ptr != nullptr) {
-      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
-        output_ptr[tid_y * fixed_out_col + in_offset] =
-            input_data[tid_y * in_col + tid_x];
-    }
-  }
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int64_t in_row,
-                            const int64_t in_col, const int64_t fixed_out_col,
-                            T** outputs_data) {
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int64_t in_row,
-                            const int64_t in_col, const int64_t fixed_out_col,
-                            T* outputs_addr0, T* outputs_addr1) {
-  T* outputs_data[2];
-  outputs_data[0] = outputs_addr0;
-  outputs_data[1] = outputs_addr1;
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int64_t in_row,
-                            const int64_t in_col, const int64_t fixed_out_col,
-                            T* outputs_addr0, T* outputs_addr1,
-                            T* outputs_addr2) {
-  T* outputs_data[3];
-  outputs_data[0] = outputs_addr0;
-  outputs_data[1] = outputs_addr1;
-  outputs_data[2] = outputs_addr2;
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-template <typename T>
-__global__ void SplitKernel(const T* input_data, const int64_t in_row,
-                            const int64_t in_col, const int64_t fixed_out_col,
-                            T* outputs_addr0, T* outputs_addr1,
-                            T* outputs_addr2, T* outputs_addr3) {
-  T* outputs_data[4];
-  outputs_data[0] = outputs_addr0;
-  outputs_data[1] = outputs_addr1;
-  outputs_data[2] = outputs_addr2;
-  outputs_data[3] = outputs_addr3;
-  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
-}
-
-static inline void GetBlockDims(const platform::CUDADeviceContext& context,
-                                int64_t num_rows, int64_t num_cols,
-                                dim3* block_dims, dim3* grid_dims) {
-  // Set the thread block and grid according to CurrentDeviceId
-  const int kThreadsPerBlock = 1024;
-  int block_cols = kThreadsPerBlock;
-  if (num_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
-    block_cols = ((num_cols + 31) >> 5) << 5;
-  }
-  int block_rows = kThreadsPerBlock / block_cols;
-  *block_dims = dim3(block_cols, block_rows, 1);
-
-  int max_threads = context.GetMaxPhysicalThreadCount();
-  int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
-
-  int grid_cols =
-      std::min((num_cols + block_cols - 1) / block_cols, max_blocks);
-  int grid_rows = std::min(max_blocks / grid_cols,
-                           std::max(num_rows / block_rows, (int64_t)1));
-  *grid_dims = dim3(grid_cols, grid_rows, 1);
-}
-
 /*
  * All tensors' dimension should be the same and the values of
  * each dimension must be the same, except the axis dimension.
@@ -234,112 +29,10 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const std::vector<framework::Tensor>& input, int axis,
                   framework::Tensor* output) {
-    // TODO(zcd): Add input data validity checking
-    int in_num = input.size();
-    int64_t in_row = 1;
-    auto dim_0 = input[0].dims();
-    for (int i = 0; i < axis; ++i) {
-      in_row *= dim_0[i];
-    }
-    int64_t in_col = input[0].numel() / in_row;
-    int64_t out_row = in_row, out_col = 0;
-
-    int inputs_col_num = in_num + 1;
-    std::vector<const T*> inputs_data_vec(in_num);
-    std::vector<int64_t> inputs_col_vec(inputs_col_num);
-    const T** inputs_data = inputs_data_vec.data();
-    int64_t* inputs_col = inputs_col_vec.data();
-
-// There are some differences between hip runtime and NV runtime.
-// In NV, when the pageable memory data less than 64K is transferred from
-// hosttodevice, it will be automatically asynchronous.
-// However, only pinned memory in hip can copy asynchronously
-// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-// 3.2.6.1. Concurrent Execution between Host and Device
-// Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-    memory::AllocationPtr data_alloc, col_alloc;
-    data_alloc =
-        memory::Alloc(platform::CUDAPinnedPlace(), in_num * sizeof(T*));
-    inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
-    col_alloc = memory::Alloc(platform::CUDAPinnedPlace(),
-                              inputs_col_num * sizeof(int));
-    inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
-#endif
-
-    inputs_col[0] = 0;
-    bool has_same_shape = true;
-    for (int i = 0; i < in_num; ++i) {
-      int64_t t_cols = input[i].numel() / in_row;
-      if (has_same_shape) {
-        if (t_cols != in_col) has_same_shape = false;
-      }
-      out_col += t_cols;
-      inputs_col[i + 1] = out_col;
-      inputs_data[i] = input[i].data<T>();
-    }
-
-    dim3 block_dims;
-    dim3 grid_dims;
-    GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims);
+    std::vector<pten::DenseTensor> pt_input{input.begin(), input.end()};
 
-    memory::allocation::AllocationPtr tmp_dev_ins_data;
-    const T** dev_ins_data = nullptr;
-    if (!has_same_shape || in_num < 2 || in_num > 4) {
-      tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*));
-      auto* restored =
-          platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num);
-      memory::Copy(context.GetPlace(), tmp_dev_ins_data->ptr(),
-                   platform::CPUPlace(), restored, in_num * sizeof(T*),
-                   context.stream());
-      dev_ins_data = reinterpret_cast<const T**>(tmp_dev_ins_data->ptr());
-    }
-
-    if (has_same_shape) {
-      if (in_num == 2) {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            inputs_data[0], inputs_data[1], in_col, out_row, out_col,
-            output->data<T>());
-      } else if (in_num == 3) {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            inputs_data[0], inputs_data[1], inputs_data[2], in_col, out_row,
-            out_col, output->data<T>());
-      } else if (in_num == 4) {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            inputs_data[0], inputs_data[1], inputs_data[2], inputs_data[3],
-            in_col, out_row, out_col, output->data<T>());
-      } else {
-        ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>());
-      }
-    } else {
-      auto tmp_dev_ins_col_data =
-          memory::Alloc(context, inputs_col_num * sizeof(int64_t));
-
-      auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph(
-          inputs_col, inputs_col_num);
-      memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(),
-                   platform::CPUPlace(), restored,
-                   inputs_col_num * sizeof(int64_t), context.stream());
-      int64_t* dev_ins_col_data =
-          static_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
-
-      ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-          dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col_num),
-          out_row, out_col, output->data<T>());
-    }
-
-#ifdef PADDLE_WITH_HIP
-    // Prevent the pinned memory value from being covered and release the memory
-    // after the launch kernel of the stream is executed (reapply pinned memory
-    // next time)
-    auto* data_alloc_released = data_alloc.release();
-    auto* col_alloc_released = col_alloc.release();
-    context.AddStreamCallback([data_alloc_released, col_alloc_released] {
-      memory::allocation::Allocator::AllocationDeleter(data_alloc_released);
-      memory::allocation::Allocator::AllocationDeleter(col_alloc_released);
-    });
-#endif
+    pten::ConcatImpl<T, platform::CUDADeviceContext>(context, pt_input, axis,
+                                                     output);
   }
 };
 
@@ -355,120 +48,12 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& input,
                   const std::vector<const framework::Tensor*>& ref_inputs,
                   int axis, std::vector<framework::Tensor*>* outputs) {
-    // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
-    // tensors of shape [0,1,4]
-    if (input.numel() == 0) {
-      return;
-    }
-
-    // TODO(zcd): Add input data validity checking
-    int o_num = outputs->size();
-    int64_t out_row = 1;
-    auto dim_0 = ref_inputs[0]->dims();
-    for (int i = 0; i < axis; ++i) {
-      out_row *= dim_0[i];
-    }
-
-    int64_t out0_col = ref_inputs[0]->numel() / out_row;
-    int64_t in_col = 0, in_row = out_row;
-    bool has_same_shape = true;
-
-    int outputs_cols_num = o_num + 1;
-    std::vector<T*> outputs_data_vec(o_num);
-    std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
-    T** outputs_data = outputs_data_vec.data();
-    int64_t* outputs_cols = outputs_cols_vec.data();
-
-// There are some differences between hip runtime and NV runtime.
-// In NV, when the pageable memory data less than 64K is transferred from
-// hosttodevice, it will be automatically asynchronous.
-// However, only pinned memory in hip can copy asynchronously
-// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
-// 3.2.6.1. Concurrent Execution between Host and Device
-// Memory copies from host to device of a memory block of 64 KB or less
-#ifdef PADDLE_WITH_HIP
-    memory::AllocationPtr data_alloc, cols_alloc;
-    data_alloc = memory::Alloc(platform::CUDAPinnedPlace(), o_num * sizeof(T*));
-    outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
-    cols_alloc = memory::Alloc(platform::CUDAPinnedPlace(),
-                               (outputs_cols_num) * sizeof(int64_t));
-    outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
-#endif
-
-    outputs_cols[0] = 0;
-    for (int i = 0; i < o_num; ++i) {
-      int64_t t_col = ref_inputs.at(i)->numel() / out_row;
-      if (has_same_shape) {
-        if (t_col != out0_col) has_same_shape = false;
-      }
-      in_col += t_col;
-      outputs_cols[i + 1] = in_col;
-      if (outputs->at(i) != nullptr) {
-        outputs_data[i] = outputs->at(i)->data<T>();
-      } else {
-        outputs_data[i] = nullptr;
-      }
-    }
-
-    dim3 block_dims;
-    dim3 grid_dims;
-    GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims);
-
-    memory::allocation::AllocationPtr tmp_dev_outs_data;
-    T** dev_out_gpu_data = nullptr;
-    if (!has_same_shape || o_num < 2 || o_num > 4) {
-      tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*));
-      auto* restored =
-          platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num);
-      memory::Copy(context.GetPlace(), tmp_dev_outs_data->ptr(),
-                   platform::CPUPlace(), restored, o_num * sizeof(T*),
-                   context.stream());
-      dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
-    }
-
-    if (has_same_shape) {
-      if (o_num == 2) {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
-            outputs_data[1]);
-      } else if (o_num == 3) {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
-            outputs_data[1], outputs_data[2]);
-      } else if (o_num == 4) {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
-            outputs_data[1], outputs_data[2], outputs_data[3]);
-      } else {
-        SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-            input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
-      }
-    } else {
-      auto tmp_dev_ins_col_data =
-          memory::Alloc(context, outputs_cols_num * sizeof(int64_t));
-      auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph(
-          outputs_cols, outputs_cols_num);
-      memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(),
-                   platform::CPUPlace(), restored,
-                   outputs_cols_num * sizeof(int64_t), context.stream());
-      int64_t* dev_outs_col_data =
-          reinterpret_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
-
-      SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
-          input.data<T>(), in_row, in_col, dev_outs_col_data,
-          static_cast<int>(outputs_cols_num), dev_out_gpu_data);
-    }
-#ifdef PADDLE_WITH_HIP
-    // Prevent the pinned memory value from being covered and release the memory
-    // after the launch kernel of the stream is executed (reapply pinned memory
-    // next time)
-    auto* data_alloc_released = data_alloc.release();
-    auto* cols_alloc_released = cols_alloc.release();
-    context.AddStreamCallback([data_alloc_released, cols_alloc_released] {
-      memory::allocation::Allocator::AllocationDeleter(data_alloc_released);
-      memory::allocation::Allocator::AllocationDeleter(cols_alloc_released);
-    });
-#endif
+    std::vector<const pten::DenseTensor*> pt_ref_inputs{ref_inputs.begin(),
+                                                        ref_inputs.end()};
+    std::vector<pten::DenseTensor*> pt_outputs{outputs->begin(),
+                                               outputs->end()};
+    pten::SplitImpl<T, platform::CUDADeviceContext>(
+        context, input, pt_ref_inputs, axis, &pt_outputs);
   }
 };
 
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index 653283b604f07..5ebaefcf808c3 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 
+#include "paddle/pten/core/lod_utils.h"
+
 namespace pten {
 class DenseTensor;
 }  // namespace pten
@@ -122,7 +124,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
           input->lod(), *in_idx, (*in_idx) + 1, 0);
       auto &lod_length = lod_and_offset.first;
 
-      framework::AppendLoD(out_lod, lod_length);
+      pten::AppendLoD(out_lod, lod_length);
 
       size_t start_offset = lod_and_offset.second.first;
       size_t end_offset = lod_and_offset.second.second;
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index f39a1c0a39d6e..493073fadc2bd 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/array_operator.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
+#include "paddle/pten/core/lod_utils.h"
+
 namespace paddle {
 namespace framework {
 class OpDesc;
@@ -73,7 +75,7 @@ class ShrinkRNNMemoryOp : public ArrayOp {
                                                               dst_num_rows, 0);
       height = lod_offset.second.second;
       auto out_lod = out_tensor.mutable_lod();
-      framework::AppendLoD(out_lod, lod_offset.first);
+      pten::AppendLoD(out_lod, lod_offset.first);
     }
 
     if (dst_num_rows != 0) {
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index 9c22fa4797219..4cb2a292018f6 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/core/lod_utils.h"
 
 namespace pten {
 class DenseTensor;
@@ -96,7 +97,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
               x_lod, start_idx, start_idx + 1, level);
 
           auto &lod_length = lod_and_offset.first;
-          framework::AppendLoD(lod, lod_length);
+          pten::AppendLoD(lod, lod_length);
 
           size_t start_offset = lod_and_offset.second.first;
           size_t end_offset = lod_and_offset.second.second;
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index cdbfa11abec72..454e3b524f5f1 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -43,7 +43,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/ir/generate_pass.h"
 #include "paddle/fluid/framework/ir/pass_builder.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/new_executor/standalone_executor.h"
 #include "paddle/fluid/framework/op_info.h"
@@ -75,6 +74,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/cuda_streams_py.h"
+#include "paddle/pten/core/lod_utils.h"
 #ifndef PADDLE_ON_INFERENCE
 #include "paddle/fluid/pybind/eager.h"
 #endif
@@ -1093,7 +1093,7 @@ PYBIND11_MODULE(core_noavx, m) {
       .def("recursive_sequence_lengths",
            [](framework::Tensor &self) -> std::vector<std::vector<size_t>> {
              // output the length-based lod info
-             LoD lod = ConvertToLengthBasedLoD(self.lod());
+             LoD lod = pten::ConvertToLengthBasedLoD(self.lod());
              std::vector<std::vector<size_t>> new_lod;
              new_lod.reserve(lod.size());
              std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 9b6e5d70cd899..cde5e719e316d 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -18,7 +18,7 @@ add_subdirectory(ops)
 add_subdirectory(tests)
 
 # make an unity target for compile deps
-set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta)
+set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils)
 get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
 # keep this message for debug, remove it later if needless
 message(STATUS "All standard pten kernels: ${pten_kernels}")
diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h
index 0b17415a6a98d..e3929d59159c1 100644
--- a/paddle/pten/api/include/kernel_signature.h
+++ b/paddle/pten/api/include/kernel_signature.h
@@ -38,6 +38,11 @@ using cast_kernel = void (*)(const DeviceContext&,
                              DataType,
                              DenseTensor*);
 
+using concat_kernel = void (*)(const DeviceContext&,
+                               const std::vector<DenseTensor>&,
+                               const Scalar&,
+                               DenseTensor*);
+
 using divide_kernel = void (*)(const DeviceContext&,
                                const DenseTensor&,
                                const DenseTensor&,
diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc
index 1420810007d1c..2e94d508aec7d 100644
--- a/paddle/pten/api/lib/utils/tensor_utils.cc
+++ b/paddle/pten/api/lib/utils/tensor_utils.cc
@@ -38,6 +38,11 @@ std::unique_ptr<pten::DenseTensor> MakePtenDenseTensorBase(
                              src.dims(),
                              src.layout(),
                              src.offset()};
+  if (!src.IsInitialized()) {
+    return std::make_unique<pten::DenseTensor>(
+        std::move(pten::make_intrusive<SharedStorage>(src.place())),
+        std::move(meta));
+  }
   auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
   return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
                                              std::move(meta));
@@ -247,7 +252,9 @@ std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
 
   if (variable.IsType<framework::LoDTensor>()) {
     const auto& tensor = variable.Get<framework::LoDTensor>();
-    if (!platform::is_same_place(tensor.place(), expected_place)) {
+
+    if (tensor.IsInitialized() &&
+        !platform::is_same_place(tensor.place(), expected_place)) {
       framework::LoDTensor tmp_tensor;
       framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
       return MakePtenDenseTensor(tmp_tensor);
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index eabc5a19babad..d89b3c9fefb59 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -12,6 +12,7 @@ cc_library(arg_map_context SRCS arg_map_context.cc DEPS enforce)
 
 cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
 cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector)
+cc_library(lod_utils SRCS lod_utils.cc DEPS enforce mixed_vector)
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base)
 cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base )
 
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index 5559b348aa1c9..5dd2bf367b3b8 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -92,7 +92,7 @@ class KernelContext {
   std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) {
     std::vector<TensorType> v;
     for (size_t i = start; i < end; ++i) {
-      auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
+      auto t = static_cast<const TensorType*>(inputs_.at(i));
       v.emplace_back(*t);
       inputs_.at(i) = nullptr;
     }
diff --git a/paddle/pten/core/lod_utils.cc b/paddle/pten/core/lod_utils.cc
new file mode 100644
index 0000000000000..ad5ea6d39d39c
--- /dev/null
+++ b/paddle/pten/core/lod_utils.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/core/lod_utils.h"
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pten {
+
+void AppendLoD(LoD *lod, const LoD &lod_length) {
+  PADDLE_ENFORCE(
+      lod->empty() || lod->size() == lod_length.size(),
+      paddle::platform::errors::InvalidArgument(
+          "The input LoD length should be equal to the appended LoD size, but "
+          "received input LoD length is %d, actual LoD size is %d.",
+          lod_length.size(),
+          lod->size()));
+  if (lod->empty()) {
+    // Seed an empty LoD with one level per entry of lod_length, each
+    // holding just the initial offset 0.
+    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+  }
+  // Extend each level with running offsets: previous back() + next length.
+  for (size_t i = 0; i < lod->size(); ++i) {
+    auto &level = (*lod)[i];
+    for (size_t len : lod_length[i]) {
+      level.push_back(level.back() + len);
+    }
+  }
+}
+
+LoD ConvertToLengthBasedLoD(const LoD &offset_lod) {
+  LoD length_lod;
+  length_lod.reserve(offset_lod.size());
+  for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) {
+    // Guard the empty level: `size() - 1` would wrap around for size_t.
+    const size_t n = offset_lod[lvl].empty() ? 0 : offset_lod[lvl].size() - 1;
+    std::vector<size_t> level;
+    level.reserve(n);
+    for (size_t idx = 0; idx < n; ++idx) {
+      level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]);
+    }
+    length_lod.push_back(level);
+  }
+  return length_lod;
+}
+
+}  // namespace pten
diff --git a/paddle/pten/core/lod_utils.h b/paddle/pten/core/lod_utils.h
new file mode 100644
index 0000000000000..4c2547a43c027
--- /dev/null
+++ b/paddle/pten/core/lod_utils.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/mixed_vector.h"
+
+namespace pten {
+using LoD = std::vector<paddle::framework::Vector<size_t>>;
+
+void AppendLoD(LoD* lod, const LoD& lod_length);
+
+/*
+ * Convert between length-based LoD and offset-based LoD.
+ * The implementation of the LoDTensor class uses offset-based LoD.
+ * However, we want to expose the more user-friendly length-based
+ * LoD to the Python side instead.
+ *
+ * Example:
+ * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]]
+ * then length_lod = [[2, 1], [3, 2, 4]]
+ */
+LoD ConvertToLengthBasedLoD(const LoD& offset_lod);
+
+}  // namespace pten
diff --git a/paddle/pten/infermeta/multiary.cc b/paddle/pten/infermeta/multiary.cc
index 5dbf3d58a1952..ecd0396a28688 100644
--- a/paddle/pten/infermeta/multiary.cc
+++ b/paddle/pten/infermeta/multiary.cc
@@ -14,4 +14,43 @@ limitations under the License. */
 
 #include "paddle/pten/infermeta/multiary.h"
 
-namespace pten {}  // namespace pten
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/kernels/funcs/concat_funcs.h"
+namespace pten {
+
+DenseTensorMeta ConcatInferMeta(const std::vector<DenseTensorMeta>& x_meta,
+                                const Scalar& axis_scalar,
+                                bool is_runtime) {
+  // Reject an empty input up front: x_meta[0] is dereferenced below.
+  PADDLE_ENFORCE_GT(x_meta.size(),
+                    0,
+                    paddle::platform::errors::InvalidArgument(
+                        "The size of input meta vector should be greater "
+                        "than 0."));
+  int axis = axis_scalar.to<int>();
+  // 1. calculate axis
+  int rank = x_meta[0].dims.size();
+  PADDLE_ENFORCE_EQ(
+      axis >= -rank && axis < rank,
+      true,
+      paddle::platform::errors::InvalidArgument(
+          "The axis is expected to be in range of [%d, %d), but got %d",
+          -rank,
+          rank,
+          axis));
+  if (axis < 0) {
+    axis = axis + rank;
+  }
+
+  // 2. calculate out dims
+  std::vector<pten::DDim> x_dims;
+  for (const auto& meta : x_meta) {
+    x_dims.push_back(meta.dims);
+  }
+  pten::DDim out_dim =
+      pten::funcs::ComputeAndCheckShape(is_runtime, x_dims, axis);
+
+  return {x_meta[0].dtype, out_dim, x_meta[0].layout};
+}
+
+}  // namespace pten
diff --git a/paddle/pten/infermeta/multiary.h b/paddle/pten/infermeta/multiary.h
index 6aa15159630bc..f8d5468e50d47 100644
--- a/paddle/pten/infermeta/multiary.h
+++ b/paddle/pten/infermeta/multiary.h
@@ -14,4 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-namespace pten {}  // namespace pten
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/tensor_meta.h"
+namespace pten {
+
+// TODO(chentianyu03) use std::vector<DenseTensor> as InferMeta inputs
+DenseTensorMeta ConcatInferMeta(const std::vector<DenseTensorMeta>& x_meta,
+                                const Scalar& axis_scalar,
+                                bool is_runtime);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index 45724e5d22abd..76e112808892d 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -24,7 +24,7 @@ endif()
 # pten depends all pten kernel targets
 set_property(GLOBAL PROPERTY PTEN_KERNELS "")
 
-set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils)
+set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils)
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas)
 # remove this dep after removing fluid deps on tensor creation
 set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils)
diff --git a/paddle/pten/kernels/concat_kernel.h b/paddle/pten/kernels/concat_kernel.h
new file mode 100644
index 0000000000000..310b9ba8c0c4c
--- /dev/null
+++ b/paddle/pten/kernels/concat_kernel.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/infermeta/multiary.h"
+#include "paddle/pten/kernels/empty_kernel.h"
+namespace pten {
+
+template <typename T, typename Context>
+void ConcatKernel(const Context& dev_ctx,
+                  const std::vector<DenseTensor>& x,
+                  const Scalar& axis,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+DenseTensor Concat(const Context& dev_ctx,
+                   const std::vector<DenseTensor>& x,
+                   const Scalar& axis) {
+  // Gather the input metas by reference; copying each DenseTensor is needless.
+  std::vector<DenseTensorMeta> x_meta;
+  for (const auto& t : x) {
+    x_meta.push_back(t.meta());
+  }
+  auto out_meta = ConcatInferMeta(x_meta, axis.to<int>(), true);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  ConcatKernel<T, Context>(dev_ctx, x, axis, &dense_out);
+  return dense_out;
+}
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/concat_and_split.h b/paddle/pten/kernels/cpu/concat_and_split.h
new file mode 100644
index 0000000000000..664ec6f66fc99
--- /dev/null
+++ b/paddle/pten/kernels/cpu/concat_and_split.h
@@ -0,0 +1,138 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+/*
+ * \brief Concatenate the input tensors along the dimension axis.
+ *  TODO(zcd): maybe it needs to be more detailed.
+ *  Examples:
+ *     Input[0] = [[1,2],[3,4]]
+ *     Input[1] = [[5,6]]
+ *     axis = 0
+ *
+ *     Output = [[1,2],
+ *               [3,4],
+ *               [5,6]]
+ */
+
+template <typename T, typename Context>
+void ConcatImpl(const Context& context,
+                const std::vector<DenseTensor>& input,
+                int axis,
+                DenseTensor* output) {
+  // TODO(zcd): Add input data validity checking
+  const size_t tensor_count = input.size();
+
+  // View every tensor as a 2-D matrix: the dimensions in front of `axis`
+  // (taken from input[0]) are folded into rows, the remainder into columns.
+  int64_t row_count = 1;
+  const auto first_dims = input[0].dims();
+  for (int d = 0; d < axis; ++d) {
+    row_count *= first_dims[d];
+  }
+
+  // Column width of each input and of the concatenated output row.
+  int64_t total_cols = 0;
+  std::vector<int64_t> widths(input.size());
+  for (size_t idx = 0; idx < tensor_count; ++idx) {
+    widths[idx] = input[idx].numel() / row_count;
+    total_cols += widths[idx];
+  }
+  auto place = context.GetPlace();
+
+  // Copy input by input, row by row, into its column range of the output.
+  auto dst_base = output->data<T>();
+  int64_t col_offset = 0;
+  for (size_t idx = 0; idx < tensor_count; ++idx) {
+    const int64_t width = widths[idx];
+    auto src_base = input[idx].data<T>();
+    for (int64_t row = 0; row < row_count; ++row) {
+      auto dst_row = dst_base + row * total_cols + col_offset;
+      auto src_row = src_base + row * width;
+      paddle::memory::Copy(place, dst_row, place, src_row, sizeof(T) * width);
+    }
+    col_offset += width;
+  }
+}
+
+/*
+ * \brief Split the input tensors along the dimension axis into outputs.
+ *  TODO(zcd): maybe it needs to be more detailed.
+ *  Examples:
+ *     Input = [[1,2],
+ *              [3,4],
+ *              [5,6]]
+ *     axis = 0
+ *
+ *     Output[0] = [[1,2],[3,4]]
+ *     Output[1] = [[5,6]]
+ */
+template <typename T, typename Context>
+void SplitImpl(const Context& context,
+               const DenseTensor& input,
+               const std::vector<const DenseTensor*>& ref_inputs,
+               const int axis,
+               std::vector<DenseTensor*>* outputs) {
+  // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
+  // tensors of shape [0,1,4]
+  if (input.numel() == 0) {
+    return;
+  }
+
+  // TODO(zcd): Add input data validity checking
+  size_t num = outputs->size();
+
+  // Sizes and offsets are int64_t: with plain int, a large tensor could
+  // overflow `k * input_cols` and truncate the per-output column counts.
+  int64_t input_rows = 1;
+  auto dim_0 = ref_inputs[0]->dims();
+  for (int i = 0; i < axis; ++i) {
+    input_rows *= dim_0[i];
+  }
+  int64_t input_cols = 0;
+  std::vector<int64_t> output_cols(outputs->size());
+  for (size_t i = 0; i < num; ++i) {
+    int64_t t_cols = ref_inputs[i]->numel() / input_rows;
+    input_cols += t_cols;
+    output_cols[i] = t_cols;
+  }
+  auto cpu_place = context.GetPlace();
+
+  // A null output is skipped, but its columns are still stepped over.
+  for (int64_t k = 0; k < input_rows; ++k) {
+    const T* src_ptr = input.data<T>() + k * input_cols;
+    int64_t col_idx = 0;
+    for (size_t j = 0; j < num; ++j) {
+      int64_t col_len = output_cols[j];
+      auto* out_tensor = outputs->at(j);
+      if (out_tensor != nullptr) {
+        T* dst_ptr = out_tensor->data<T>() + k * col_len;
+        paddle::memory::Copy(cpu_place,
+                             dst_ptr,
+                             cpu_place,
+                             src_ptr + col_idx,
+                             sizeof(T) * col_len);
+      }
+      col_idx += col_len;
+    }
+  }
+}
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/concat_kernel.cc b/paddle/pten/kernels/cpu/concat_kernel.cc
new file mode 100644
index 0000000000000..fb59c9c6005ff
--- /dev/null
+++ b/paddle/pten/kernels/cpu/concat_kernel.cc
@@ -0,0 +1,125 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/concat_kernel.h"
+
+#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/core/lod_utils.h"
+#include "paddle/pten/kernels/cpu/concat_and_split.h"
+#include "paddle/pten/kernels/funcs/concat_funcs.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void ConcatKernel(const Context& dev_ctx,
+                  const std::vector<DenseTensor>& x,
+                  const Scalar& axis_scalar,
+                  DenseTensor* out) {
+  int64_t axis = axis_scalar.to<int64_t>();  // normalized by ComputeAxis
+
+  axis = pten::funcs::ComputeAxis(axis, x[0].dims().size());
+
+  std::vector<pten::DDim> x_dims;
+  for (size_t i = 0; i < x.size(); ++i) {
+    x_dims.push_back(x[i].dims());
+  }
+
+  pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis);
+  out->Resize(out_dims);
+  out->mutable_data<T>();
+
+  // Along axis 0 the output LoD differs from the inputs', so merge them here.
+  if (axis == 0 && x[0].lod().size() > 0) {
+    size_t lod_size_0 = x[0].lod().size();
+    size_t lod_size = lod_size_0;
+    for (size_t i = 1; i < x.size(); ++i) {
+      if (x[i].lod().size() > 0) {
+        PADDLE_ENFORCE_EQ(
+            x[i].lod().size(),
+            lod_size_0,
+            paddle::platform::errors::Unimplemented(
+                "The lod level of all input LoDTensors should be same. "
+                "Maybe different lod level of input LoDTensors can concat,"
+                "it is not supported currently. The lod level of %dth input "
+                "is %d and first input is %d.",
+                i,
+                x[i].lod().size(),
+                lod_size_0));
+      } else {
+        lod_size = 0;  // an input without LoD disables LoD merging
+        break;
+      }
+    }
+    if (lod_size) {
+      auto* out_lod = out->mutable_lod();
+      for (size_t i = 1; i < x.size(); ++i) {  // x[0] is not appended here
+        auto in_lod = pten::ConvertToLengthBasedLoD(x[i].lod());
+        pten::AppendLoD(out_lod, in_lod);
+      }
+    }
+  }
+
+  // Direct copies are sometimes faster; this heuristic needs deeper analysis.
+  if (axis == 0 && x.size() < 10) {
+    size_t output_offset = 0;
+    for (auto& in : x) {
+      if (in.numel() == 0UL) {
+        continue;  // empty inputs contribute nothing
+      }
+      auto in_stride = paddle::framework::stride_numel(in.dims());
+      auto out_stride = paddle::framework::stride_numel(out->dims());
+      paddle::operators::StridedNumelCopyWithAxis<T>(
+          dev_ctx,
+          axis,
+          out->data<T>() + output_offset,
+          out_stride,
+          in.data<T>(),
+          in_stride,
+          in_stride[axis]);
+      output_offset += in_stride[axis];
+    }
+  } else {
+    std::vector<pten::DenseTensor> inputs;
+    for (size_t j = 0; j < x.size(); ++j) {
+      if (x[j].numel() > 0) {
+        inputs.push_back(x[j]);
+      } else {
+        continue;  // empty inputs are not passed to ConcatImpl
+      }
+    }
+    ConcatImpl<T, Context>(dev_ctx, inputs, axis, out);
+  }
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(concat,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::ConcatKernel,
+                   float,  // element types listed for this registration
+                   double,
+                   bool,
+                   int64_t,
+                   int,
+                   uint8_t,
+                   paddle::platform::float16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}  // empty body
diff --git a/paddle/pten/kernels/funcs/concat_funcs.h b/paddle/pten/kernels/funcs/concat_funcs.h
new file mode 100644
index 0000000000000..8455b8096922c
--- /dev/null
+++ b/paddle/pten/kernels/funcs/concat_funcs.h
@@ -0,0 +1,95 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+namespace pten {
+namespace funcs {
+
+static inline int64_t ComputeAxis(int64_t axis, int64_t rank) {
+  PADDLE_ENFORCE_EQ(
+      axis >= -rank && axis < rank,
+      true,
+      paddle::platform::errors::InvalidArgument(
+          "The axis is expected to be in range of [%d, %d), but got %d",
+          -rank,
+          rank,
+          axis));
+  // A negative axis counts from the last dimension, so shift it by rank.
+  const int64_t shifted = axis < 0 ? axis + rank : axis;
+  // Never hand a negative axis back to the caller.
+  return shifted > 0 ? shifted : 0;
+}
+
+static inline pten::DDim ComputeAndCheckShape(
+    const bool is_runtime,
+    const std::vector<pten::DDim>& inputs_dims,
+    const size_t axis) {
+  const size_t n = inputs_dims.size();
+  auto out_dims = inputs_dims[0];  // starts as input[0]'s dims
+  size_t in_zero_dims_size = out_dims.size();  // rank of input[0]
+  for (size_t i = 1; i < n; i++) {
+    PADDLE_ENFORCE_EQ(inputs_dims[i].size(),
+                      out_dims.size(),
+                      paddle::platform::errors::InvalidArgument(
+                          "The shape of input[0] and input[%d] "
+                          "is expected to be equal. "
+                          "But received input[0]'s shape = "
+                          "[%s], input[%d]'s shape = [%s].",
+                          i,
+                          inputs_dims[0],
+                          i,
+                          inputs_dims[i]));
+    for (size_t j = 0; j < in_zero_dims_size; j++) {
+      if (j == axis) {  // extents accumulate along the concat axis
+        if (is_runtime) {
+          out_dims[axis] += inputs_dims[i][j];
+        } else {
+          if (inputs_dims[i][j] == -1 || out_dims[j] == -1) {
+            out_dims[axis] = -1;  // a -1 extent propagates to the output
+          } else {
+            out_dims[axis] += inputs_dims[i][j];
+          }
+        }
+      } else {
+        bool check_shape =
+            is_runtime || (inputs_dims[0][j] > 0 && inputs_dims[i][j] > 0);
+        if (check_shape) {
+          // At run time every extent is checked; otherwise only positive ones.
+          PADDLE_ENFORCE_EQ(inputs_dims[0][j],
+                            inputs_dims[i][j],
+                            paddle::platform::errors::InvalidArgument(
+                                "The %d-th dimension of input[0] and input[%d] "
+                                "is expected to be equal. "
+                                "But received input[0]'s shape = "
+                                "[%s], input[%d]'s shape = [%s].",
+                                j,
+                                i,
+                                inputs_dims[0],
+                                i,
+                                inputs_dims[i]));
+        }
+        if (!is_runtime && out_dims[j] == -1 && inputs_dims[i][j] > 0) {
+          out_dims[j] = inputs_dims[i][j];  // fill -1 from a positive extent
+        }
+      }
+    }
+  }
+  return out_dims;
+}
+
+}  // namespace funcs
+}  // namespace  pten
diff --git a/paddle/pten/kernels/gpu/concat_and_split.h b/paddle/pten/kernels/gpu/concat_and_split.h
new file mode 100644
index 0000000000000..66b21b5f51351
--- /dev/null
+++ b/paddle/pten/kernels/gpu/concat_and_split.h
@@ -0,0 +1,569 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <vector>
+#include "gflags/gflags.h"
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+#include "paddle/pten/backends/gpu/gpu_context.h"
+
+namespace pten {
+
+template <typename T>
+__global__ void ConcatKernel_(const T** inputs,
+                              const int64_t* input_cols,
+                              int col_size,  // not read by the kernel body
+                              const int64_t output_rows,
+                              const int64_t output_cols,
+                              T* output) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;  // output column
+  int curr_segment = 0;  // index of the input that owns column tid_x
+  int curr_offset = input_cols[0];  // first output column of that input
+  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
+    int curr_col_offset = input_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {  // advance to the owning segment
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+      curr_col_offset = input_cols[curr_segment + 1];
+    }
+
+    int local_col = tid_x - curr_offset;  // column inside the input
+    int segment_width = curr_col_offset - curr_offset;
+
+    const T* input_ptr = inputs[curr_segment];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;  // output row
+    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
+      output[tid_y * output_cols + tid_x] =
+          input_ptr[tid_y * segment_width + local_col];
+  }
+}
+
+template <typename T>
+__device__ void ConcatKernelDetail(const T** inputs_data,
+                                   const int fixed_in_col,
+                                   const int out_rows,
+                                   const int out_cols,
+                                   T* output_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;  // output column
+  for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_in_col;  // index of the input owning tid_x
+    int in_offset = tid_x - split * fixed_in_col;  // column inside that input
+    const T* input_ptr = inputs_data[split];
+    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;  // output row
+    for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
+      output_data[tid_y * out_cols + tid_x] =
+          input_ptr[tid_y * fixed_in_col + in_offset];
+    }
+  }
+}
+
+template <typename T>
+__global__ void ConcatKernel_(const T* input_addr0,
+                              const T* input_addr1,
+                              const int64_t fixed_in_col,
+                              const int64_t out_rows,
+                              const int64_t out_cols,
+                              T* output_data) {
+  // Two inputs of width fixed_in_col: their addresses arrive as kernel
+  // arguments and are packed into a local table for ConcatKernelDetail.
+  const T* inputs_data[2] = {input_addr0, input_addr1};
+  ConcatKernelDetail<T>(
+      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
+}
+
+template <typename T>
+__global__ void ConcatKernel_(const T* input_addr0,
+                              const T* input_addr1,
+                              const T* input_addr2,
+                              const int64_t fixed_in_col,
+                              const int64_t out_rows,
+                              const int64_t out_cols,
+                              T* output_data) {
+  // Three inputs of width fixed_in_col: their addresses arrive as kernel
+  // arguments and are packed into a local table that ConcatKernelDetail
+  // indexes by input number.
+  const T* inputs_data[3] = {input_addr0, input_addr1, input_addr2};
+  ConcatKernelDetail<T>(
+      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
+}
+
+template <typename T>
+__global__ void ConcatKernel_(const T* input_addr0,
+                              const T* input_addr1,
+                              const T* input_addr2,
+                              const T* input_addr3,
+                              const int64_t fixed_in_col,
+                              const int64_t out_rows,
+                              const int64_t out_cols,
+                              T* output_data) {
+  // Four inputs of width fixed_in_col: their addresses arrive as kernel
+  // arguments and are packed into a local table that ConcatKernelDetail
+  // indexes by input number.
+  const T* inputs_data[4] = {
+      input_addr0, input_addr1, input_addr2, input_addr3};
+  ConcatKernelDetail<T>(
+      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
+}
+
+template <typename T>
+__global__ void ConcatKernel_(const T** inputs_data,
+                              const int in_num,  // not read by the body
+                              const int64_t fixed_in_col,
+                              const int64_t out_rows,
+                              const int64_t out_cols,
+                              T* output_data) {
+  ConcatKernelDetail<T>(  // NOTE: the int64_t sizes narrow to int here
+      inputs_data, fixed_in_col, out_rows, out_cols, output_data);
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data,
+                            const int64_t in_row,
+                            const int64_t in_col,
+                            const int64_t* out_cols,
+                            int out_cols_size,  // not read by the kernel body
+                            T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;  // input column
+  int curr_segment = 0;  // index of the output that receives column tid_x
+  int curr_offset = out_cols[0];  // first input column of that output
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int curr_col_offset = out_cols[curr_segment + 1];
+    while (curr_col_offset <= tid_x) {  // advance to the owning segment
+      curr_offset = curr_col_offset;
+      ++curr_segment;
+      curr_col_offset = out_cols[curr_segment + 1];
+    }
+
+    int local_col = tid_x - curr_offset;  // column inside the output
+    int segment_width = curr_col_offset - curr_offset;
+    T* output_ptr = outputs_data[curr_segment];
+    if (output_ptr != nullptr) {  // null outputs are skipped
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;  // input row
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * segment_width + local_col] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
+template <typename T>
+__device__ void SplitKernelDetail(const T* input_data,
+                                  const int in_row,
+                                  const int in_col,
+                                  const int fixed_out_col,
+                                  T** outputs_data) {
+  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;  // input column
+  for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
+    int split = tid_x / fixed_out_col;  // index of the receiving output
+    int in_offset = tid_x - split * fixed_out_col;  // column in that output
+    T* output_ptr = outputs_data[split];
+    if (output_ptr != nullptr) {  // null outputs are skipped
+      int tid_y = blockIdx.y * blockDim.y + threadIdx.y;  // input row
+      for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
+        output_ptr[tid_y * fixed_out_col + in_offset] =
+            input_data[tid_y * in_col + tid_x];
+    }
+  }
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data,
+                            const int64_t in_row,
+                            const int64_t in_col,
+                            const int64_t fixed_out_col,  // same for all
+                            T** outputs_data) {  // one pointer per output
+  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data,
+                            const int64_t in_row,
+                            const int64_t in_col,
+                            const int64_t fixed_out_col,
+                            T* outputs_addr0,
+                            T* outputs_addr1) {
+  // Two outputs of width fixed_out_col: their addresses arrive as kernel
+  // arguments and are packed into a local table for SplitKernelDetail.
+  T* outputs_data[2] = {outputs_addr0, outputs_addr1};
+  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data,
+                            const int64_t in_row,
+                            const int64_t in_col,
+                            const int64_t fixed_out_col,
+                            T* outputs_addr0,
+                            T* outputs_addr1,
+                            T* outputs_addr2) {
+  // Three outputs of width fixed_out_col: their addresses arrive as kernel
+  // arguments and are packed into a local table that SplitKernelDetail
+  // indexes by output number.
+  T* outputs_data[3] = {outputs_addr0, outputs_addr1, outputs_addr2};
+  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
+}
+
+template <typename T>
+__global__ void SplitKernel(const T* input_data,
+                            const int64_t in_row,
+                            const int64_t in_col,
+                            const int64_t fixed_out_col,
+                            T* outputs_addr0,
+                            T* outputs_addr1,
+                            T* outputs_addr2,
+                            T* outputs_addr3) {
+  // Four outputs of width fixed_out_col: their addresses arrive as kernel
+  // arguments and are packed into a local table that SplitKernelDetail
+  // indexes by output number.
+  T* outputs_data[4] = {
+      outputs_addr0, outputs_addr1, outputs_addr2, outputs_addr3};
+  SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
+}
+
+static inline void GetBlockDims(
+    const paddle::platform::CUDADeviceContext& context,
+    int64_t num_rows,
+    int64_t num_cols,
+    dim3* block_dims,
+    dim3* grid_dims) {
+  // Pick block and grid shapes from the problem size and the device limits.
+  const int kThreadsPerBlock = 1024;
+  int block_cols = kThreadsPerBlock;
+  if (num_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+    block_cols = ((num_cols + 31) >> 5) << 5;  // round up to multiple of 32
+  }
+  int block_rows = kThreadsPerBlock / block_cols;  // assumes num_cols > 0
+  *block_dims = dim3(block_cols, block_rows, 1);
+
+  int max_threads = context.GetMaxPhysicalThreadCount();
+  int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+
+  int grid_cols =
+      std::min((num_cols + block_cols - 1) / block_cols, max_blocks);
+  int grid_rows = std::min(max_blocks / grid_cols,
+                           std::max(num_rows / block_rows, (int64_t)1));
+  *grid_dims = dim3(grid_cols, grid_rows, 1);  // grid capped by max_blocks
+}
+
+/*
+ * All tensors must have the same rank, and their sizes must match in
+ * every dimension except the axis dimension.
+ */
+template <typename T, typename Context>
+void ConcatImpl(const Context& context,
+                const std::vector<pten::DenseTensor>& input,
+                int axis,
+                pten::DenseTensor* output) {
+  // TODO(zcd): Add input data validity checking
+  int in_num = input.size();
+  int64_t in_row = 1;  // product of the dims before `axis`
+  auto dim_0 = input[0].dims();
+  for (int i = 0; i < axis; ++i) {
+    in_row *= dim_0[i];
+  }
+  int64_t in_col = input[0].numel() / in_row;
+  int64_t out_row = in_row, out_col = 0;
+
+  int inputs_col_num = in_num + 1;  // prefix sums need one extra slot
+  std::vector<const T*> inputs_data_vec(in_num);
+  std::vector<int64_t> inputs_col_vec(inputs_col_num);
+  const T** inputs_data = inputs_data_vec.data();
+  int64_t* inputs_col = inputs_col_vec.data();
+
+// There are some differences between hip runtime and NV runtime.
+// In NV, when the pageable memory data less than 64K is transferred from
+// host to device, it will be automatically asynchronous.
+// However, only pinned memory in hip can copy asynchronously
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
+// 3.2.6.1. Concurrent Execution between Host and Device
+// Memory copies from host to device of a memory block of 64 KB or less
+#ifdef PADDLE_WITH_HIP
+  paddle::memory::AllocationPtr data_alloc, col_alloc;
+  // TODO(chentianyu03): try to find a method to remove the Alloc function
+  data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
+                                     in_num * sizeof(T*));
+  inputs_data = reinterpret_cast<const T**>(data_alloc->ptr());
+  // inputs_col holds int64_t offsets, so the buffer is sized in int64_t.
+  col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
+                                    inputs_col_num * sizeof(int64_t));
+  inputs_col = reinterpret_cast<int64_t*>(col_alloc->ptr());
+#endif
+
+  inputs_col[0] = 0;  // inputs_col[i] = first output column of input i
+  bool has_same_shape = true;
+  for (int i = 0; i < in_num; ++i) {
+    int64_t t_cols = input[i].numel() / in_row;
+    if (has_same_shape) {
+      if (t_cols != in_col) has_same_shape = false;
+    }
+    out_col += t_cols;
+    inputs_col[i + 1] = out_col;
+    inputs_data[i] = input[i].data<T>();
+  }
+
+  dim3 block_dims;
+  dim3 grid_dims;
+  GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims);
+
+  paddle::memory::allocation::AllocationPtr tmp_dev_ins_data;
+  const T** dev_ins_data = nullptr;
+  if (!has_same_shape || in_num < 2 || in_num > 4) {  // pointer table needed
+    tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*));
+    auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
+        inputs_data, in_num);
+    paddle::memory::Copy(context.GetPlace(),
+                         tmp_dev_ins_data->ptr(),
+                         paddle::platform::CPUPlace(),
+                         restored,
+                         in_num * sizeof(T*),
+                         context.stream());
+    dev_ins_data = reinterpret_cast<const T**>(tmp_dev_ins_data->ptr());
+  }
+
+  if (has_same_shape) {
+    if (in_num == 2) {
+      ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
+          inputs_data[0],
+          inputs_data[1],
+          in_col,
+          out_row,
+          out_col,
+          output->data<T>());
+    } else if (in_num == 3) {
+      ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
+          inputs_data[0],
+          inputs_data[1],
+          inputs_data[2],
+          in_col,
+          out_row,
+          out_col,
+          output->data<T>());
+    } else if (in_num == 4) {
+      ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
+          inputs_data[0],
+          inputs_data[1],
+          inputs_data[2],
+          inputs_data[3],
+          in_col,
+          out_row,
+          out_col,
+          output->data<T>());
+    } else {
+      ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
+          dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>());
+    }
+  } else {
+    auto tmp_dev_ins_col_data =
+        paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t));
+
+    auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
+        inputs_col, inputs_col_num);
+    paddle::memory::Copy(context.GetPlace(),
+                         tmp_dev_ins_col_data->ptr(),
+                         paddle::platform::CPUPlace(),
+                         restored,
+                         inputs_col_num * sizeof(int64_t),
+                         context.stream());
+    int64_t* dev_ins_col_data =
+        static_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
+
+    ConcatKernel_<<<grid_dims, block_dims, 0, context.stream()>>>(
+        dev_ins_data,
+        dev_ins_col_data,
+        static_cast<int>(inputs_col_num),
+        out_row,
+        out_col,
+        output->data<T>());
+  }
+
+#ifdef PADDLE_WITH_HIP
+  // Prevent the pinned memory value from being covered and release the memory
+  // after the launch kernel of the stream is executed (reapply pinned memory
+  // next time)
+  auto* data_alloc_released = data_alloc.release();
+  auto* col_alloc_released = col_alloc.release();
+  context.AddStreamCallback([data_alloc_released, col_alloc_released] {
+    paddle::memory::allocation::Allocator::AllocationDeleter(
+        data_alloc_released);
+    paddle::memory::allocation::Allocator::AllocationDeleter(
+        col_alloc_released);
+  });
+#endif
+}
+
+/*
+ * All tensors must have the same rank, and their sizes must match in
+ * every dimension except the axis dimension.
+ */
+template <typename T, typename Context>
+void SplitImpl(const Context& context,
+               const pten::DenseTensor& input,
+               const std::vector<const pten::DenseTensor*>& ref_inputs,
+               int axis,
+               std::vector<pten::DenseTensor*>* outputs) {
+  // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3
+  // tensors of shape [0,1,4]
+  if (input.numel() == 0) {
+    return;
+  }
+
+  // TODO(zcd): Add input data validity checking
+  int o_num = outputs->size();
+  int64_t out_row = 1;  // product of the dims before `axis`
+  auto dim_0 = ref_inputs[0]->dims();
+  for (int i = 0; i < axis; ++i) {
+    out_row *= dim_0[i];
+  }
+
+  int64_t out0_col = ref_inputs[0]->numel() / out_row;
+  int64_t in_col = 0, in_row = out_row;
+  bool has_same_shape = true;
+
+  int outputs_cols_num = o_num + 1;  // prefix sums need one extra slot
+  std::vector<T*> outputs_data_vec(o_num);
+  std::vector<int64_t> outputs_cols_vec(outputs_cols_num);
+  T** outputs_data = outputs_data_vec.data();
+  int64_t* outputs_cols = outputs_cols_vec.data();
+
+// There are some differences between hip runtime and NV runtime.
+// In NV, when the pageable memory data less than 64K is transferred from
+// host to device, it will be automatically asynchronous.
+// However, only pinned memory in hip can copy asynchronously
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device
+// 3.2.6.1. Concurrent Execution between Host and Device
+// Memory copies from host to device of a memory block of 64 KB or less
+#ifdef PADDLE_WITH_HIP
+  paddle::memory::AllocationPtr data_alloc, cols_alloc;
+  // TODO(chentianyu03): try to find a method to remove the Alloc function
+  data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
+                                     o_num * sizeof(T*));
+  outputs_data = reinterpret_cast<T**>(data_alloc->ptr());
+  // TODO(chentianyu03): try to find a method to remove the Alloc function
+  cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(),
+                                     (outputs_cols_num) * sizeof(int64_t));
+  outputs_cols = reinterpret_cast<int64_t*>(cols_alloc->ptr());
+#endif
+
+  outputs_cols[0] = 0;  // outputs_cols[i] = first input column of output i
+  for (int i = 0; i < o_num; ++i) {
+    int64_t t_col = ref_inputs.at(i)->numel() / out_row;
+    if (has_same_shape) {
+      if (t_col != out0_col) has_same_shape = false;
+    }
+    in_col += t_col;
+    outputs_cols[i + 1] = in_col;
+    if (outputs->at(i) != nullptr) {
+      outputs_data[i] = outputs->at(i)->data<T>();
+    } else {
+      outputs_data[i] = nullptr;  // the split kernels skip null outputs
+    }
+  }
+
+  dim3 block_dims;
+  dim3 grid_dims;
+  GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims);
+
+  paddle::memory::allocation::AllocationPtr tmp_dev_outs_data;
+  T** dev_out_gpu_data = nullptr;
+  if (!has_same_shape || o_num < 2 || o_num > 4) {  // pointer table needed
+    // TODO(chentianyu03): try to find a method to remove the Alloc function
+    tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*));
+    auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
+        outputs_data, o_num);
+    paddle::memory::Copy(context.GetPlace(),
+                         tmp_dev_outs_data->ptr(),
+                         paddle::platform::CPUPlace(),
+                         restored,
+                         o_num * sizeof(T*),
+                         context.stream());
+    dev_out_gpu_data = reinterpret_cast<T**>(tmp_dev_outs_data->ptr());
+  }
+
+  if (has_same_shape) {
+    if (o_num == 2) {
+      SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
+          input.data<T>(),
+          in_row,
+          in_col,
+          out0_col,
+          outputs_data[0],
+          outputs_data[1]);
+    } else if (o_num == 3) {
+      SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
+          input.data<T>(),
+          in_row,
+          in_col,
+          out0_col,
+          outputs_data[0],
+          outputs_data[1],
+          outputs_data[2]);
+    } else if (o_num == 4) {
+      SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
+          input.data<T>(),
+          in_row,
+          in_col,
+          out0_col,
+          outputs_data[0],
+          outputs_data[1],
+          outputs_data[2],
+          outputs_data[3]);
+    } else {
+      SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
+          input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
+    }
+  } else {
+    auto tmp_dev_ins_col_data =
+        // TODO(chentianyu03): try to find a method to remove the Alloc function
+        paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t));
+    auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph(
+        outputs_cols, outputs_cols_num);
+    paddle::memory::Copy(context.GetPlace(),
+                         tmp_dev_ins_col_data->ptr(),
+                         paddle::platform::CPUPlace(),
+                         restored,
+                         outputs_cols_num * sizeof(int64_t),
+                         context.stream());
+    int64_t* dev_outs_col_data =
+        reinterpret_cast<int64_t*>(tmp_dev_ins_col_data->ptr());
+
+    SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
+        input.data<T>(),
+        in_row,
+        in_col,
+        dev_outs_col_data,
+        static_cast<int>(outputs_cols_num),
+        dev_out_gpu_data);
+  }
+#ifdef PADDLE_WITH_HIP
+  // Prevent the pinned memory value from being covered and release the memory
+  // after the launch kernel of the stream is executed (reapply pinned memory
+  // next time)
+  auto* data_alloc_released = data_alloc.release();
+  auto* cols_alloc_released = cols_alloc.release();
+  context.AddStreamCallback([data_alloc_released, cols_alloc_released] {
+    paddle::memory::allocation::Allocator::AllocationDeleter(
+        data_alloc_released);
+    paddle::memory::allocation::Allocator::AllocationDeleter(
+        cols_alloc_released);
+  });
+#endif
+}
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/gpu/concat_kernel.cu b/paddle/pten/kernels/gpu/concat_kernel.cu
new file mode 100644
index 0000000000000..6ddfef460fc6c
--- /dev/null
+++ b/paddle/pten/kernels/gpu/concat_kernel.cu
@@ -0,0 +1,125 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/concat_kernel.h"
+
+#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/common/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/core/lod_utils.h"
+#include "paddle/pten/kernels/funcs/concat_funcs.h"
+#include "paddle/pten/kernels/gpu/concat_and_split.h"
+
+namespace pten {
+
+// Concatenates the tensors of `x` along `axis_scalar` into `out`, which is
+// resized and allocated here. `x` must be non-empty (x[0] is read directly);
+// zero-element inputs are skipped when copying.
+template <typename T, typename Context>
+void ConcatKernel(const Context& dev_ctx,
+                  const std::vector<DenseTensor>& x,
+                  const Scalar& axis_scalar,
+                  DenseTensor* out) {
+  int64_t axis = axis_scalar.to<int64_t>();
+  axis = pten::funcs::ComputeAxis(axis, x[0].dims().size());
+
+  std::vector<pten::DDim> x_dims;
+  for (size_t i = 0; i < x.size(); ++i) {
+    x_dims.push_back(x[i].dims());
+  }
+
+  pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis);
+  out->Resize(out_dims);
+  out->mutable_data<T>();
+
+  // If axis is 0, the lod of the output is not the same as inputs.
+  if (axis == 0 && x[0].lod().size() > 0) {
+    size_t lod_size_0 = x[0].lod().size();
+    size_t lod_size = lod_size_0;
+    for (size_t i = 1; i < x.size(); ++i) {
+      if (x[i].lod().size() > 0) {
+        PADDLE_ENFORCE_EQ(
+            x[i].lod().size(),
+            lod_size_0,
+            paddle::platform::errors::Unimplemented(
+                "The lod level of all input LoDTensors should be same. "
+                "Maybe different lod level of input LoDTensors can concat, "
+                "it is not supported currently. The lod level of %dth input "
+                "is %d and first input is %d.",
+                i,
+                x[i].lod().size(),
+                lod_size_0));
+      } else {
+        lod_size = 0;
+        break;
+      }
+    }
+    if (lod_size) {
+      auto* out_lod = out->mutable_lod();
+      for (size_t i = 1; i < x.size(); ++i) {
+        auto in_lod = pten::ConvertToLengthBasedLoD(x[i].lod());
+        pten::AppendLoD(out_lod, in_lod);
+      }
+    }
+  }
+
+  // Sometimes direct copies are faster; this may need deeper analysis.
+  if (axis == 0 && x.size() < 10) {
+    size_t output_offset = 0;
+    auto out_stride = paddle::framework::stride_numel(out->dims());
+    for (auto& in : x) {
+      if (in.numel() == 0UL) {
+        continue;
+      }
+      auto in_stride = paddle::framework::stride_numel(in.dims());
+      paddle::operators::StridedNumelCopyWithAxis<T>(
+          dev_ctx,
+          axis,
+          out->data<T>() + output_offset,
+          out_stride,
+          in.data<T>(),
+          in_stride,
+          in_stride[axis]);
+      output_offset += in_stride[axis];
+    }
+  } else {
+    std::vector<pten::DenseTensor> inputs;
+    for (size_t j = 0; j < x.size(); ++j) {
+      if (x[j].numel() > 0) {
+        inputs.push_back(x[j]);
+      }
+    }
+    ConcatImpl<T, Context>(dev_ctx, inputs, axis, out);
+  }
+}
+
+}  // namespace pten
+
+PT_REGISTER_KERNEL(concat,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::ConcatKernel,
+                   float,
+                   double,
+                   bool,
+                   int64_t,
+                   int,
+                   uint8_t,
+                   paddle::platform::float16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt
index 79d9a3d82e69e..e9faa22c4eb7b 100644
--- a/paddle/pten/tests/api/CMakeLists.txt
+++ b/paddle/pten/tests/api/CMakeLists.txt
@@ -21,3 +21,4 @@ cc_test(test_sum_api SRCS test_sum_api.cc DEPS pten_tensor pten_api pten_api_uti
 cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api_utils)
 cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS pten_tensor pten_api pten_api_utils)
 cc_test(test_conj_api SRCS test_conj_api.cc DEPS pten_tensor pten_api pten_api_utils)
+cc_test(test_concat_api SRCS test_concat_api.cc DEPS pten_tensor pten_api pten_api_utils)
diff --git a/paddle/pten/tests/api/test_concat_api.cc b/paddle/pten/tests/api/test_concat_api.cc
new file mode 100644
index 0000000000000..e84aee0aaaf4f
--- /dev/null
+++ b/paddle/pten/tests/api/test_concat_api.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/api/include/api.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace paddle {
+namespace tests {
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// TODO(chentianyu03): Remove this test after the API is used in the dygraph
+TEST(API, concat) {
+  // Build two 3x10 float32 CPU tensors; element k of each one holds the
+  // value k.
+  constexpr size_t kNumel = 3 * 10;
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+
+  auto lhs = std::make_shared<pten::DenseTensor>(
+      alloc.get(),
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 10}),
+                            pten::DataLayout::NCHW));
+
+  auto rhs = std::make_shared<pten::DenseTensor>(
+      alloc.get(),
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 10}),
+                            pten::DataLayout::NCHW));
+  auto* lhs_data = lhs->mutable_data<float>();
+  auto* rhs_data = rhs->mutable_data<float>();
+
+  for (size_t k = 0; k < kNumel; ++k) {
+    lhs_data[k] = k * 1.0;
+    rhs_data[k] = k * 1.0;
+  }
+
+  paddle::experimental::Tensor x(lhs);
+  paddle::experimental::Tensor y(rhs);
+
+  // Concatenate both tensors along axis 0 through the public C++ API.
+  std::vector<paddle::experimental::Tensor> inputs{x, y};
+  auto out = paddle::experimental::concat(inputs, 0);
+
+  // The output must be an initialized 6x10 float32 NCHW tensor on the CPU.
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 6);
+  ASSERT_EQ(out.dims()[1], 10);
+  ASSERT_EQ(out.numel(), 60);
+  ASSERT_TRUE(out.is_cpu());
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_TRUE(out.initialized());
+
+  // The first kNumel outputs come from the first input, the rest from the
+  // second one.
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto out_data = dense_out->data<float>();
+
+  for (size_t k = 0; k < 2 * kNumel; ++k) {
+    const float expected = k < kNumel ? lhs_data[k] : rhs_data[k - kNumel];
+    ASSERT_NEAR(expected, out_data[k], 1e-6f);
+  }
+}
+
+}  // namespace tests
+}  // namespace paddle
diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt
index 6f70f2ca2c895..407e5c097aec4 100644
--- a/paddle/pten/tests/kernels/CMakeLists.txt
+++ b/paddle/pten/tests/kernels/CMakeLists.txt
@@ -10,3 +10,4 @@ cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS pten pten
 cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS pten pten_api_utils)
 cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS pten pten_api_utils)
 cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS pten pten_api_utils)
+cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS pten pten_api_utils)
diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc
new file mode 100644
index 0000000000000..c5d979ad908ff
--- /dev/null
+++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/kernels/concat_kernel.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace pten {
+namespace tests {
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(DEV_API, concat) {
+  // Build two 3x10 float32 CPU tensors; element k of each one holds the
+  // value k.
+  constexpr size_t kNumel = 3 * 10;
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+
+  pten::DenseTensor lhs(alloc.get(),
+                        pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                              framework::make_ddim({3, 10}),
+                                              pten::DataLayout::NCHW));
+  pten::DenseTensor rhs(alloc.get(),
+                        pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                              framework::make_ddim({3, 10}),
+                                              pten::DataLayout::NCHW));
+  auto* lhs_data = lhs.mutable_data<float>();
+  auto* rhs_data = rhs.mutable_data<float>();
+
+  for (size_t k = 0; k < kNumel; ++k) {
+    lhs_data[k] = k * 1.0;
+    rhs_data[k] = k * 1.0;
+  }
+
+  // Fetch the CPU device context from DeviceContextPool and downcast it to
+  // the concrete type that is passed to pten::Concat.
+  auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(
+      paddle::platform::CPUPlace());
+  auto& cpu_ctx =
+      *static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx);
+
+  // Concatenate both tensors along axis 0 with the device-level API.
+  std::vector<pten::DenseTensor> inputs = {lhs, rhs};
+  auto out = pten::Concat<float>(cpu_ctx, inputs, 0);
+
+  // The output must be a 6x10 float32 NCHW tensor.
+  ASSERT_EQ(out.dims().size(), 2);
+  ASSERT_EQ(out.dims()[0], 6);
+  ASSERT_EQ(out.dims()[1], 10);
+  ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32);
+  ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW);
+
+  // The first kNumel outputs come from the first input, the rest from the
+  // second one.
+  auto out_data = out.data<float>();
+
+  for (size_t k = 0; k < 2 * kNumel; ++k) {
+    const float expected = k < kNumel ? lhs_data[k] : rhs_data[k - kNumel];
+    ASSERT_NEAR(expected, out_data[k], 1e-6f);
+  }
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 562a726aa29f2..1bf5344e83746 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -18,6 +18,16 @@
     param : [x, out_dtype]
     data_type : x
 
+
+- api : concat
+  args : (const std::vector<Tensor>& x, const Scalar& axis)
+  output : Tensor
+  infer_meta :
+    func : ConcatInferMeta
+    param : [x, axis, true]
+  kernel :
+    func : concat
+
 - api : conj
   args : (const Tensor& x)
   output : Tensor
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index e8539b11d1455..c994731585246 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -58,7 +58,10 @@ def parse_args(self, args_str):
             f"Args declaration should start with '(' and end with ')', please check the args of {self.api} in api.yaml."
         args_str = args_str[1:-1]
         args_list = args_str.split(',')
-        input_types = ['const Tensor&', 'const Tensor &']
+        input_types = [
+            'const Tensor&', 'const Tensor &', 'const std::vector<Tensor>&',
+            'const std::vector<Tensor> &'
+        ]
         attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \
                       'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \
                       'const std::vector<int64_t>&', 'Backend', 'DataLayout', 'DataType']
@@ -247,7 +250,7 @@ def gene_infer_meta(self, input_names, attr_names, infer_meta) -> str:
         param_code = ""
         for param in infer_meta_params:
             if param in input_names:
-                param_code = param_code + self.prefix_tensor_name + param + "->meta(), "
+                param_code = param_code + "GetDenseTensorMeta(" + self.prefix_tensor_name + param + "), "
             elif param in attr_names:
                 param_code = param_code + param + ", "
             elif isinstance(param, str):
@@ -267,7 +270,7 @@ def get_kernel_args(self, input_names, attrs, kernel_param):
         for input_name in input_names:
             # set input code
             input_tensor_code = input_tensor_code + f"""
-  auto {self.prefix_tensor_name}{input_name} = std::dynamic_pointer_cast<pten::DenseTensor>({input_name}.impl());"""
+  auto {self.prefix_tensor_name}{input_name} = TensorToDenseTensor({input_name});"""
 
         attr_names = attrs['names']
         if kernel_param is None:
@@ -374,6 +377,35 @@ def api_namespace():
 """)
 
 
+def tensor_to_densetensor() -> str:  # C++ helper code written into the generated API source
+    return """
+  std::shared_ptr<pten::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
+      return std::dynamic_pointer_cast<pten::DenseTensor>(tensor.impl());
+  }
+
+  std::shared_ptr<std::vector<pten::DenseTensor>> TensorToDenseTensor(const std::vector<Tensor>& tensors) {
+      std::vector<pten::DenseTensor> pt_tensors;
+
+      for(auto & t : tensors) {
+          pt_tensors.push_back(*std::dynamic_pointer_cast<pten::DenseTensor>(t.impl()));
+      }
+      return std::make_shared<std::vector<pten::DenseTensor>>(pt_tensors);
+  }
+
+   const pten::DenseTensorMeta GetDenseTensorMeta(const std::shared_ptr<pten::DenseTensor> & x) {
+       return x->meta();
+   }
+
+   const std::vector<pten::DenseTensorMeta> GetDenseTensorMeta(const std::shared_ptr<std::vector<pten::DenseTensor>>& x) {
+       std::vector<pten::DenseTensorMeta> metas;
+       for(auto& t : *x) {
+           metas.push_back(t.meta());
+       }
+       return metas;
+   }
+"""
+
+
 def generate_api(api_yaml_path, header_file_path, source_file_path):
 
     with open(api_yaml_path, 'r') as f:
@@ -390,6 +422,7 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
     include_header_file = "paddle/pten/api/include/api.h"
     source_file.write(source_include(include_header_file))
     source_file.write(namespace[0])
+    source_file.write(tensor_to_densetensor())
 
     for api in apis:
         api_code = API(api)

From 854a7ab3589704499a8332b9967011c4457fd507 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Fri, 21 Jan 2022 18:46:24 +0800
Subject: [PATCH 13/15] add pten dependency to infrt (#39079)

* add pten dependency to infrt

* fix code style

* add pten::CPUContext

* revert .ignore
---
 paddle/infrt/CMakeLists.txt         |  7 ++++--
 paddle/infrt/host_context/value.h   |  5 ++++
 paddle/infrt/kernel/CMakeLists.txt  |  1 +
 paddle/infrt/kernel/pten_kernels.cc | 37 +++++++++++++++++++++++++++++
 paddle/infrt/kernel/pten_kernels.h  | 35 +++++++++++++++++++++++++++
 paddle/scripts/infrt_build.sh       |  0
 6 files changed, 83 insertions(+), 2 deletions(-)
 create mode 100644 paddle/infrt/kernel/pten_kernels.cc
 create mode 100644 paddle/infrt/kernel/pten_kernels.h
 mode change 100644 => 100755 paddle/scripts/infrt_build.sh

diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt
index 8af3012a220ad..e371e2391829d 100644
--- a/paddle/infrt/CMakeLists.txt
+++ b/paddle/infrt/CMakeLists.txt
@@ -1,3 +1,6 @@
+# TODO: remove fluid
+include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform)
+
 if (NOT WITH_INFRT)
     return()
 endif()
@@ -88,8 +91,8 @@ set(infrt_mlir_incs
         )
 message(STATUS "infrt srcs:\n${infrt_src}")
 
-cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto)
-cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto)
+cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto pten dense_tensor)
+cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto pten dense_tensor)
 add_dependencies(infrt ${infrt_mlir_incs})
 
 add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS})
diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h
index 4a2b92a7e69c5..7f68e59f8a698 100644
--- a/paddle/infrt/host_context/value.h
+++ b/paddle/infrt/host_context/value.h
@@ -29,6 +29,9 @@
 #include "paddle/infrt/tensor/tensor_map.h"
 #include "paddle/infrt/tensor/tensor_shape.h"
 
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/dense_tensor.h"
+
 namespace infrt {
 namespace host_context {
 
@@ -45,6 +48,8 @@ using ValueVariantType = Variant<int16_t,
                                  tensor::DenseHostTensor,
                                  MlirFunctionExecutable*,
                                  tensor::TensorMap,
+                                 pten::CPUContext,
+                                 pten::DenseTensor,
                                  std::vector<int16_t>,
                                  std::vector<int32_t>,
                                  std::vector<int64_t>,
diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt
index da858aad28f81..7e9ed8e5572c0 100644
--- a/paddle/infrt/kernel/CMakeLists.txt
+++ b/paddle/infrt/kernel/CMakeLists.txt
@@ -2,6 +2,7 @@ core_gather_headers()
 
 gather_srcs(infrt_src SRCS
     basic_kernels.cc
+    pten_kernels.cc
     test_kernels.cc
     tensor_shape_kernels.cc
     tensor_kernels.cc
diff --git a/paddle/infrt/kernel/pten_kernels.cc b/paddle/infrt/kernel/pten_kernels.cc
new file mode 100644
index 0000000000000..70c44b829f774
--- /dev/null
+++ b/paddle/infrt/kernel/pten_kernels.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <string>
+
+#include "paddle/infrt/host_context/kernel_registry.h"
+#include "paddle/infrt/host_context/kernel_utils.h"
+#include "paddle/infrt/kernel/pten_kernels.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/kernels/math_kernel.h"
+
+using infrt::host_context::Attribute;
+
+namespace infrt {
+namespace kernel {
+
+void RegisterPtenKernels(host_context::KernelRegistry* registry) {
+  registry->AddKernel("pd_cpu.add.float32",  // pten::AddKernel, T = float
+                      INFRT_KERNEL(pten::AddKernel<float, pten::CPUContext>));
+  registry->AddKernel("pd_cpu.add.int32",  // pten::AddKernel, T = int
+                      INFRT_KERNEL(pten::AddKernel<int, pten::CPUContext>));
+}
+
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/infrt/kernel/pten_kernels.h b/paddle/infrt/kernel/pten_kernels.h
new file mode 100644
index 0000000000000..c290f8ea524fb
--- /dev/null
+++ b/paddle/infrt/kernel/pten_kernels.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+
+namespace infrt {
+namespace host_context {
+
+struct KernelRegistry;
+
+}  // namespace host_context
+}  // namespace infrt
+
+namespace infrt {
+namespace kernel {
+
+/**
+ * Register the pten-backed kernels with the given kernel registry.
+ */
+void RegisterPtenKernels(host_context::KernelRegistry* registry);
+
+}  // namespace kernel
+}  // namespace infrt
diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh
old mode 100644
new mode 100755

From a0f586bc626b3fddcc104e46e521e37bc7e4e302 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Fri, 21 Jan 2022 20:03:11 +0800
Subject: [PATCH 14/15] [PTen]Separate origin Kernel and add Kernel for C++ API
 (#39002)

* add kernel for c++ api

* fix compile bugs

* fix kunlun compile bugs

* perfect cmake

* fix compile bugs when run ci-inference

* fix compile bugs

* add non-raw kernel for fluid op

* fix compile bugs

* fix compile bugs

* fix unit test bug
---
 cmake/pten_kernel.cmake                       |  61 +++--
 paddle/fluid/operators/cholesky_solve_op.h    |   2 +-
 .../elementwise/elementwise_add_op.h          |   2 +-
 .../elementwise/elementwise_div_op.h          |   2 +-
 .../elementwise/elementwise_mul_op.cu         |   4 +-
 .../elementwise/elementwise_mul_op.h          |   2 +-
 .../operators/elementwise/elementwise_op.h    |  24 +-
 .../elementwise/elementwise_sub_op.h          |   2 +-
 paddle/fluid/operators/lu_op.h                |   4 +-
 paddle/fluid/operators/reduce_ops/reduce_op.h |  13 +-
 paddle/pten/api/include/kernel_signature.h    |   6 -
 paddle/pten/core/kernel_alias_name.h          |  12 +-
 paddle/pten/kernels/cpu/math_kernel.cc        |  76 +++----
 paddle/pten/kernels/gpu/math_kernel.cu        |  77 ++++---
 paddle/pten/kernels/math_kernel.cc            | 212 ++++++++++++++++++
 paddle/pten/kernels/math_kernel.h             | 125 ++++++-----
 .../tests/kernels/test_elementwise_dev_api.cc |  12 +-
 python/paddle/utils/code_gen/api.yaml         |   7 +-
 18 files changed, 453 insertions(+), 190 deletions(-)
 create mode 100644 paddle/pten/kernels/math_kernel.cc

diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake
index bc9fefb58f452..c2928376a02f8 100644
--- a/cmake/pten_kernel.cmake
+++ b/cmake/pten_kernel.cmake
@@ -103,38 +103,55 @@ function(kernel_library TARGET)
     list(LENGTH gpu_srcs gpu_srcs_len)
     list(LENGTH xpu_srcs xpu_srcs_len)
 
-    if (${common_srcs_len} GREATER 0)
-        # If the kernel has a device independent public implementation,
-        # we will use this implementation and will not adopt the implementation
-        # under specific devices
+    # Build the target according to how its sources are organized
+    if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
+        ${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0)
+        # If common_srcs depend on device-specific srcs, build the target using this rule.
+        if (WITH_GPU)
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+                nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+            endif()
+        elseif (WITH_ROCM)
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+                hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+            endif()
+        else()
+            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+                cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+                cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+            endif()
+        endif()
+    elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            endif()
         elseif (WITH_ROCM)
-            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            endif()
         else()
-            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+                cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            endif()
         endif()
     else()
-        # If the kernel has a header file declaration, but no corresponding
-        # implementation can be found, this is not allowed
-        if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
-            ${xpu_srcs_len} EQUAL 0)
-            message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
+        if (${common_srcs_len} EQUAL 0)
+             message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
         else()
+            # If the kernel has a device independent public implementation,
+            # we will use this implementation and will not adopt the implementation
+            # under specific devices
             if (WITH_GPU)
-                if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-                    nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                endif()
+                nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
             elseif (WITH_ROCM)
-                if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-                    hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                endif()
+                hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
             else()
-                if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
-                    cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                endif()
+                cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
             endif()
-        endif()
+         endif()
     endif()
 
     if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h
index 4b1d075de91ca..5004aad7c59bc 100644
--- a/paddle/fluid/operators/cholesky_solve_op.h
+++ b/paddle/fluid/operators/cholesky_solve_op.h
@@ -202,7 +202,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
       commonterm_for_range(commonterm_functor);
       commonterm_conj = helper.Transpose(commonterm_conj);
 
-      pten::AddKernel<T>(
+      pten::AddRawKernel<T>(
           static_cast<const typename paddle::framework::ConvertToPtenContext<
               DeviceContext>::TYPE &>(dev_ctx),
           commonterm, commonterm_conj, -1, &commonterm);
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index a4897a06d5611..5c4f791b2270c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -61,7 +61,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::AddKernel<T>(
+    pten::AddRawKernel<T>(
         static_cast<const typename framework::ConvertToPtenContext<
             DeviceContext>::TYPE &>(dev_ctx),
         *pt_x.get(), *pt_y.get(), axis, pt_z.get());
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index 44f695278dca8..a45f09b63e9fe 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -51,7 +51,7 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::DivideKernel<T>(
+    pten::DivideRawKernel<T>(
         static_cast<const typename framework::ConvertToPtenContext<
             DeviceContext>::TYPE&>(dev_ctx),
         *pt_x.get(), *pt_y.get(), axis, pt_z.get());
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index 86a803106347d..0c7d12ae0ad55 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -51,8 +51,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
-                              pt_z.get());
+      pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
+                                 pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index d918407930d96..e7a5e48b1f1b5 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -124,7 +124,7 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::MultiplyKernel<T>(
+      pten::MultiplyRawKernel<T>(
           static_cast<const typename framework::ConvertToPtenContext<
               DeviceContext>::TYPE&>(dev_ctx),
           *pt_x.get(), *pt_y.get(), axis, pt_z.get());
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index e1d9655e293a3..aaf33ca674488 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -140,26 +140,42 @@ class ElementwiseOp : public framework::OperatorWithKernel {
 
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext &ctx) const override {
+    int axis = ctx.Attr<int>("axis");
     if (Type() == "elementwise_add") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("add", {"X", "Y"}, {"axis"}, {"Out"});
+        if (axis == -1) {
+          return framework::KernelSignature("add", {"X", "Y"}, {}, {"Out"});
+        }
+        return framework::KernelSignature("add_raw", {"X", "Y"}, {"axis"},
+                                          {"Out"});
       }
     }
     if (Type() == "elementwise_sub") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("subtract", {"X", "Y"}, {"axis"},
+        if (axis == -1) {
+          return framework::KernelSignature("subtract", {"X", "Y"}, {},
+                                            {"Out"});
+        }
+        return framework::KernelSignature("subtract_raw", {"X", "Y"}, {"axis"},
                                           {"Out"});
       }
     }
     if (Type() == "elementwise_div") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("divide", {"X", "Y"}, {"axis"},
+        if (axis == -1) {
+          return framework::KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
+        }
+        return framework::KernelSignature("divide_raw", {"X", "Y"}, {"axis"},
                                           {"Out"});
       }
     }
     if (Type() == "elementwise_mul") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-        return framework::KernelSignature("multiply", {"X", "Y"}, {"axis"},
+        if (axis == -1) {
+          return framework::KernelSignature("multiply", {"X", "Y"}, {},
+                                            {"Out"});
+        }
+        return framework::KernelSignature("multiply_raw", {"X", "Y"}, {"axis"},
                                           {"Out"});
       }
     }
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
index 46d4a93e804f5..7d1749f20abf2 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -51,7 +51,7 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::SubtractKernel<T>(
+    pten::SubtractRawKernel<T>(
         static_cast<const typename framework::ConvertToPtenContext<
             DeviceContext>::TYPE&>(dev_ctx),
         *pt_x.get(), *pt_y.get(), axis, pt_z.get());
diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h
index 6beef1add8e4c..c3b3552ba1329 100644
--- a/paddle/fluid/operators/lu_op.h
+++ b/paddle/fluid/operators/lu_op.h
@@ -221,7 +221,7 @@ void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1,
   out->Resize(src1.dims());
   out->mutable_data<T>(dev_ctx.GetPlace());
 
-  pten::AddKernel<
+  pten::AddRawKernel<
       T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
       static_cast<const typename paddle::framework::ConvertToPtenContext<
           DeviceContext>::TYPE&>(dev_ctx),
@@ -234,7 +234,7 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1,
   out->Resize(src1.dims());
   out->mutable_data<T>(dev_ctx.GetPlace());
 
-  pten::SubtractKernel<
+  pten::SubtractRawKernel<
       T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
       static_cast<const typename paddle::framework::ConvertToPtenContext<
           DeviceContext>::TYPE&>(dev_ctx),
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index e2002856a4d08..2e5bd7a42b1d1 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -551,17 +551,26 @@ class ReduceOp : public framework::OperatorWithKernel {
 
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext& ctx) const override {
+    bool reduce_all = ctx.Attr<bool>("reduce_all");
     if (Type() == "reduce_sum") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+        if (!reduce_all) {
+          return framework::KernelSignature(
+              "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"});
+        }
         return framework::KernelSignature(
-            "sum", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
+            "sum_raw", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
             {"Out"});
       }
     }
     if (Type() == "reduce_mean") {
       if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+        if (!reduce_all) {
+          return framework::KernelSignature("mean", {"X"}, {"dim", "keep_dim"},
+                                            {"Out"});
+        }
         return framework::KernelSignature(
-            "mean", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+            "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
       }
     }
     // TODO(chentianyu03): support other cases after selected rows added
diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h
index e3929d59159c1..d750b47ef864b 100644
--- a/paddle/pten/api/include/kernel_signature.h
+++ b/paddle/pten/api/include/kernel_signature.h
@@ -30,7 +30,6 @@ using DeviceContext = paddle::platform::DeviceContext;
 using add_kernel = void (*)(const DeviceContext&,
                             const DenseTensor&,
                             const DenseTensor&,
-                            int,
                             DenseTensor*);
 
 using cast_kernel = void (*)(const DeviceContext&,
@@ -46,7 +45,6 @@ using concat_kernel = void (*)(const DeviceContext&,
 using divide_kernel = void (*)(const DeviceContext&,
                                const DenseTensor&,
                                const DenseTensor&,
-                               int,
                                DenseTensor*);
 
 using dot_kernel = void (*)(const DeviceContext&,
@@ -82,13 +80,11 @@ using mean_kernel = void (*)(const DeviceContext&,
                              const DenseTensor&,
                              const std::vector<int64_t>&,
                              bool,
-                             bool,
                              DenseTensor*);
 
 using multiply_kernel = void (*)(const DeviceContext&,
                                  const DenseTensor&,
                                  const DenseTensor&,
-                                 int,
                                  DenseTensor*);
 
 using reshape_kernel = void (*)(const DeviceContext&,
@@ -107,14 +103,12 @@ using sum_kernel = void (*)(const DeviceContext&,
                             const DenseTensor&,
                             const std::vector<int64_t>&,
                             bool,
-                            bool,
                             DataType,
                             DenseTensor*);
 
 using subtract_kernel = void (*)(const DeviceContext&,
                                  const DenseTensor&,
                                  const DenseTensor&,
-                                 int,
                                  DenseTensor*);
 
 using conj_kernel = void (*)(const DeviceContext&,
diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h
index 5c86787966368..8e089970f9139 100644
--- a/paddle/pten/core/kernel_alias_name.h
+++ b/paddle/pten/core/kernel_alias_name.h
@@ -20,10 +20,10 @@ namespace pten {
 // the key is kernel_name in fluid, the value is the kernel_name in pten
 // the key is sorted by key's alphabet
 const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
-    {"elementwise_add", "add"},
-    {"elementwise_div", "divide"},
-    {"elementwise_mul", "muliply"},
-    {"elementwise_sub", "subtract"},
+    {"elementwise_add", "add_raw"},
+    {"elementwise_div", "divide_raw"},
+    {"elementwise_mul", "multiply_raw"},
+    {"elementwise_sub", "subtract_raw"},
     {"fill_any_like", "full_like"},
     {"fill_constant", "full"},
     {"flatten_contiguous_range", "flatten"},
@@ -32,8 +32,8 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
     {"matmul_v2_grad", "matmul_grad"},
     {"matmul_v2_grad_grad", "matmul_double_grad"},
     {"matmul_v2_triple_grad", "matmul_triple_grad"},
-    {"reduce_mean", "mean"},
-    {"reduce_sum", "sum"},
+    {"reduce_mean", "mean_raw"},
+    {"reduce_sum", "sum_raw"},
     {"reshape2", "reshape"},
     {"reshape2_grad", "reshape_grad"},
     {"reshape2_grad_grad", "reshape_double_grad"},
diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc
index 7841dd4113cff..706a40936a393 100644
--- a/paddle/pten/kernels/cpu/math_kernel.cc
+++ b/paddle/pten/kernels/cpu/math_kernel.cc
@@ -32,11 +32,11 @@ namespace pten {
 
 #define DEFINE_CPU_ELEMENTWISE_OP(name)                                     \
   template <typename T, typename Context>                                   \
-  void name##Kernel(const Context& dev_ctx,                                 \
-                    const DenseTensor& x,                                   \
-                    const DenseTensor& y,                                   \
-                    int axis,                                               \
-                    DenseTensor* out) {                                     \
+  void name##RawKernel(const Context& dev_ctx,                              \
+                       const DenseTensor& x,                                \
+                       const DenseTensor& y,                                \
+                       int axis,                                            \
+                       DenseTensor* out) {                                  \
     out->mutable_data<T>();                                                 \
     if (x.dims() == y.dims()) {                                             \
       SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
@@ -55,23 +55,35 @@ namespace pten {
   }
 
 template <typename T, typename Context>
-void MeanKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const std::vector<int64_t>& dims,
-                bool keep_dim,
-                bool reduce_all,
-                DenseTensor* out) {
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
   auto out_dtype = x.dtype();
   pten::Reduce<CPUContext, T, pten::funcs::MeanFunctor>(
       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
 }
 
 template <typename T, typename Context>
-void DivideKernel(const Context& dev_ctx,
+void SumRawKernel(const Context& dev_ctx,
                   const DenseTensor& x,
-                  const DenseTensor& y,
-                  int axis,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
                   DenseTensor* out) {
+  pten::Reduce<CPUContext, T, pten::funcs::SumFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void DivideRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis,
+                     DenseTensor* out) {
   // allocate memory for out
   out->mutable_data<T>();
   if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
@@ -90,18 +102,6 @@ void DivideKernel(const Context& dev_ctx,
   }
 }
 
-template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               bool keep_dim,
-               bool reduce_all,
-               DataType out_dtype,
-               DenseTensor* out) {
-  pten::Reduce<CPUContext, T, pten::funcs::SumFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
 // Create the definition of Add
 DEFINE_CPU_ELEMENTWISE_OP(Add)
 
@@ -118,42 +118,40 @@ using complex128 = ::paddle::platform::complex<double>;
 
 // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
 // using bfloat16 = ::paddle::platform::bfloat16;
-PT_REGISTER_KERNEL(
-    mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {}
-PT_REGISTER_KERNEL(add,
+PT_REGISTER_KERNEL(add_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::AddKernel,
+                   pten::AddRawKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(subtract,
+PT_REGISTER_KERNEL(subtract_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::SubtractKernel,
+                   pten::SubtractRawKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(divide,
+PT_REGISTER_KERNEL(divide_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::DivideKernel,
+                   pten::DivideRawKernel,
                    float,
                    double,
                    int,
                    int64_t,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(multiply,
+PT_REGISTER_KERNEL(multiply_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::MultiplyKernel,
+                   pten::MultiplyRawKernel,
                    float,
                    double,
                    int,
@@ -161,10 +159,10 @@ PT_REGISTER_KERNEL(multiply,
                    bool,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(sum,
+PT_REGISTER_KERNEL(sum_raw,
                    CPU,
                    ALL_LAYOUT,
-                   pten::SumKernel,
+                   pten::SumRawKernel,
                    bool,
                    float,
                    double,
@@ -175,3 +173,5 @@ PT_REGISTER_KERNEL(sum,
                    complex128) {
   kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
 }
+PT_REGISTER_KERNEL(
+    mean_raw, CPU, ALL_LAYOUT, pten::MeanRawKernel, float, double, bool) {}
diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu
index d7a16ac49b1c9..6b6383f81065b 100644
--- a/paddle/pten/kernels/gpu/math_kernel.cu
+++ b/paddle/pten/kernels/gpu/math_kernel.cu
@@ -37,11 +37,11 @@ namespace pten {
 
 #define DEFINE_CUDA_ELEMENTWISE_OP(name)                             \
   template <typename T, typename Context>                            \
-  void name##Kernel(const Context& dev_ctx,                          \
-                    const DenseTensor& x,                            \
-                    const DenseTensor& y,                            \
-                    int axis,                                        \
-                    DenseTensor* out) {                              \
+  void name##RawKernel(const Context& dev_ctx,                       \
+                       const DenseTensor& x,                         \
+                       const DenseTensor& y,                         \
+                       int axis,                                     \
+                       DenseTensor* out) {                           \
     std::vector<const DenseTensor*> inputs;                          \
     std::vector<DenseTensor*> outputs;                               \
     inputs.emplace_back(&x);                                         \
@@ -57,17 +57,29 @@ namespace pten {
  */
 
 template <typename T, typename Context>
-void MeanKernel(const Context& dev_ctx,
-                const DenseTensor& x,
-                const std::vector<int64_t>& dims,
-                bool keep_dim,
-                bool reduce_all,
-                DenseTensor* out) {
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out) {
   auto out_dtype = x.dtype();
   pten::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
 }
 
+template <typename T, typename Context>
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out) {
+  pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+}
+
 // Create the definition of Add
 DEFINE_CUDA_ELEMENTWISE_OP(Add)
 // Create the definition of Subtract
@@ -77,30 +89,16 @@ DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
 // Create the definition of Divide
 DEFINE_CUDA_ELEMENTWISE_OP(Divide)
 
-template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               bool keep_dim,
-               bool reduce_all,
-               DataType out_dtype,
-               DenseTensor* out) {
-  pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
-      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
-}
-
 }  // namespace pten
 
 using float16 = paddle::platform::float16;
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;
 
-PT_REGISTER_KERNEL(
-    mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {}
-PT_REGISTER_KERNEL(add,
+PT_REGISTER_KERNEL(add_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::AddKernel,
+                   pten::AddRawKernel,
                    float,
                    double,
                    int,
@@ -108,10 +106,10 @@ PT_REGISTER_KERNEL(add,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(subtract,
+PT_REGISTER_KERNEL(subtract_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::SubtractKernel,
+                   pten::SubtractRawKernel,
                    float,
                    double,
                    int,
@@ -119,10 +117,10 @@ PT_REGISTER_KERNEL(subtract,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(divide,
+PT_REGISTER_KERNEL(divide_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::DivideKernel,
+                   pten::DivideRawKernel,
                    float,
                    double,
                    int,
@@ -130,10 +128,10 @@ PT_REGISTER_KERNEL(divide,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(multiply,
+PT_REGISTER_KERNEL(multiply_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::MultiplyKernel,
+                   pten::MultiplyRawKernel,
                    float,
                    double,
                    int,
@@ -142,10 +140,10 @@ PT_REGISTER_KERNEL(multiply,
                    float16,
                    complex64,
                    complex128) {}
-PT_REGISTER_KERNEL(sum,
+PT_REGISTER_KERNEL(sum_raw,
                    GPU,
                    ALL_LAYOUT,
-                   pten::SumKernel,
+                   pten::SumRawKernel,
                    bool,
                    float,
                    double,
@@ -156,3 +154,12 @@ PT_REGISTER_KERNEL(sum,
                    complex128) {
   kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
 }
+
+PT_REGISTER_KERNEL(mean_raw,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::MeanRawKernel,
+                   float,
+                   double,
+                   bool,
+                   float16) {}
diff --git a/paddle/pten/kernels/math_kernel.cc b/paddle/pten/kernels/math_kernel.cc
new file mode 100644
index 0000000000000..423282ab97ca4
--- /dev/null
+++ b/paddle/pten/kernels/math_kernel.cc
@@ -0,0 +1,212 @@
+//   Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/math_kernel.h"
+
+#include "paddle/pten/backends/all_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+namespace pten {
+
+template <typename T, typename Context>
+void MeanKernel(const Context& dev_ctx,
+                const DenseTensor& x,
+                const std::vector<int64_t>& dims,
+                bool keep_dim,
+                DenseTensor* out) {
+  bool reduce_all = false;
+  MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
+}
+
+template <typename T, typename Context>
+void SumKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DataType out_dtype,
+               DenseTensor* out) {
+  bool reduce_all = false;
+  SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
+}
+
+template <typename T, typename Context>
+void AddKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& y,
+               DenseTensor* out) {
+  int axis = -1;
+  AddRawKernel<T>(dev_ctx, x, y, axis, out);
+}
+
+template <typename T, typename Context>
+void SubtractKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out) {
+  int axis = -1;
+  SubtractRawKernel<T>(dev_ctx, x, y, axis, out);
+}
+
+template <typename T, typename Context>
+void DivideKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  DenseTensor* out) {
+  int axis = -1;
+  DivideRawKernel<T>(dev_ctx, x, y, axis, out);
+}
+
+template <typename T, typename Context>
+void MultiplyKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out) {
+  int axis = -1;
+  MultiplyRawKernel<T>(dev_ctx, x, y, axis, out);
+}
+
+}  // namespace pten
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+PT_REGISTER_KERNEL(
+    mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {}
+
+PT_REGISTER_KERNEL(sum,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::SumKernel,
+                   bool,
+                   float,
+                   double,
+                   paddle::platform::float16,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
+
+PT_REGISTER_KERNEL(add,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::AddKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+PT_REGISTER_KERNEL(subtract,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::SubtractKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+PT_REGISTER_KERNEL(divide,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::DivideKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
+PT_REGISTER_KERNEL(multiply,
+                   CPU,
+                   ALL_LAYOUT,
+                   pten::MultiplyKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   complex64,
+                   complex128) {}
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_REGISTER_KERNEL(mean,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::MeanKernel,
+                   float,
+                   double,
+                   bool,
+                   paddle::platform::float16) {}
+PT_REGISTER_KERNEL(sum,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::SumKernel,
+                   bool,
+                   float,
+                   double,
+                   paddle::platform::float16,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
+PT_REGISTER_KERNEL(add,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::AddKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   paddle::platform::float16,
+                   complex64,
+                   complex128) {}
+PT_REGISTER_KERNEL(subtract,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::SubtractKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   paddle::platform::float16,
+                   complex64,
+                   complex128) {}
+PT_REGISTER_KERNEL(divide,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::DivideKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   paddle::platform::float16,
+                   complex64,
+                   complex128) {}
+PT_REGISTER_KERNEL(multiply,
+                   GPU,
+                   ALL_LAYOUT,
+                   pten::MultiplyKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16,
+                   complex64,
+                   complex128) {}
+#endif
diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h
index 65c0f84e696de..95379baaf3504 100644
--- a/paddle/pten/kernels/math_kernel.h
+++ b/paddle/pten/kernels/math_kernel.h
@@ -22,104 +22,127 @@ limitations under the License. */
 
 namespace pten {
 
+template <typename T, typename Context>
+void MeanRawKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int64_t>& dims,
+                   bool keep_dim,
+                   bool reduce_all,
+                   DenseTensor* out);
+
 template <typename T, typename Context>
 void MeanKernel(const Context& dev_ctx,
                 const DenseTensor& x,
                 const std::vector<int64_t>& dims,
                 bool keep_dim,
-                bool reduce_all,
                 DenseTensor* out);
 
+template <typename T, typename Context>
+void SumRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DataType out_dtype,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void SumKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DataType out_dtype,
+               DenseTensor* out);
+
+template <typename T, typename Context>
+void AddRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  int axis,
+                  DenseTensor* out);
+
 template <typename T, typename Context>
 void AddKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
-               int axis,
                DenseTensor* out);
 
+template <typename T, typename Context>
+void SubtractRawKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);
+
 template <typename T, typename Context>
 void SubtractKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& y,
-                    int axis,
                     DenseTensor* out);
 
+template <typename T, typename Context>
+void DivideRawKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis,
+                     DenseTensor* out);
+
 template <typename T, typename Context>
 void DivideKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const DenseTensor& y,
-                  int axis,
                   DenseTensor* out);
 
+template <typename T, typename Context>
+void MultiplyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& x,
+                       const DenseTensor& y,
+                       int axis,
+                       DenseTensor* out);
+
 template <typename T, typename Context>
 void MultiplyKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     const DenseTensor& y,
-                    int axis,
                     DenseTensor* out);
 
-template <typename T, typename Context>
-void SumKernel(const Context& dev_ctx,
-               const DenseTensor& x,
-               const std::vector<int64_t>& dims,
-               bool keep_dim,
-               bool reduce_all,
-               DataType out_dtype,
-               DenseTensor* out);
-
 template <typename T, typename Context>
 DenseTensor Add(const Context& dev_ctx,
                 const DenseTensor& x,
-                const DenseTensor& y,
-                int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  AddKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }
 
 template <typename T, typename Context>
 DenseTensor Subtract(const Context& dev_ctx,
                      const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  SubtractKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                     const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }
 
 template <typename T, typename Context>
 DenseTensor Divide(const Context& dev_ctx,
                    const DenseTensor& x,
-                   const DenseTensor& y,
-                   int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  DivideKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                   const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }
 
 template <typename T, typename Context>
 DenseTensor Multiply(const Context& dev_ctx,
                      const DenseTensor& x,
-                     const DenseTensor& y,
-                     int axis) {
-  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-  pten::DenseTensor dense_out(
-      pten::make_intrusive<paddle::experimental::SharedStorage>(
-          dev_ctx.GetPlace()),
-      std::move(out_meta));
-  MultiplyKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+                     const DenseTensor& y) {
+  auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+  MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
   return dense_out;
 }
 
@@ -130,8 +153,7 @@ DenseTensor Mean(const Context& dev_ctx,
                  bool keep_dim) {
   auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim);
   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
-  bool reduce_all = false;
-  MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
+  MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
   return dense_out;
 }
 
@@ -144,12 +166,7 @@ DenseTensor Sum(const Context& dev_ctx,
   auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype);
   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
 
-  // The real value of reduce_all will be get in kernel
-  // so use default value(false) is OK.
-  bool reduce_all = false;
-
-  SumKernel<T, Context>(
-      dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out);
+  SumKernel<T, Context>(dev_ctx, x, axis, keep_dim, dtype, &dense_out);
   return dense_out;
 }
 
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index 0bc16371c0731..e5d9b05eec7b3 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -54,11 +54,10 @@ TEST(DEV_API, add) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0;
   }
-  int axis = 1;
 
   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y);
 
   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
@@ -101,11 +100,10 @@ TEST(DEV_API, subtract) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0;
   }
-  int axis = 1;
 
   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y);
 
   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
@@ -148,11 +146,10 @@ TEST(DEV_API, divide) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0 + 1;
   }
-  int axis = 1;
 
   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y);
 
   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
@@ -195,11 +192,10 @@ TEST(DEV_API, multiply) {
   for (size_t i = 0; i < 10; ++i) {
     dense_y_data[i] = i * 2.0;
   }
-  int axis = 1;
 
   // 2. test API
   pten::CPUContext dev_ctx;
-  auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y, axis);
+  auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y);
 
   // 3. check result
   ASSERT_EQ(dense_out.dims().size(), 2);
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 1bf5344e83746..a0d7ce84f75fd 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -6,7 +6,6 @@
     param : [x, y, -1]
   kernel :
     func : add
-    param : [x, y, -1]
 
 - api : cast
   args : (const Tensor& x, DataType out_dtype)
@@ -44,7 +43,6 @@
     param : [x, y, -1]
   kernel :
     func : divide
-    param : [x, y, -1]
 
 - api : dot
   args : (const Tensor& x, const Tensor& y)
@@ -130,7 +128,6 @@
     param: [x, axis, keep_dim]
   kernel : 
     func : mean
-    param : [x, axis, keep_dim, false]
 
 - api : multiply
   args : (const Tensor& x, const Tensor& y)
@@ -140,7 +137,6 @@
     param : [x, y, -1]
   kernel :
     func : multiply
-    param : [x, y, -1]
 
 - api : ones_like
   args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED)
@@ -172,7 +168,6 @@
     param : [x, y, -1]
   kernel :
     func : subtract
-    param : [x, y, -1]
 
 - api : sum
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
@@ -182,7 +177,7 @@
     param: [x, axis, keep_dim, dtype]
   kernel : 
     func : sum
-    param : [x, axis, keep_dim, false, DataType::UNDEFINED]
+    param : [x, axis, keep_dim, dtype]
     data_type : x
 
 - api : zeros_like

From a14dc68820dbb221831b13b8c43155f537e265e9 Mon Sep 17 00:00:00 2001
From: chentianyu03 <chentianyu03@baidu.com>
Date: Fri, 21 Jan 2022 20:56:04 +0800
Subject: [PATCH 15/15] [pten] Fix build failure in the concat dev API test (#39117)

* Fix build failure in the concat dev API test

* fix conflict

* fix conflict
---
 paddle/fluid/operators/concat_op.h               |  5 ++++-
 paddle/pten/kernels/cpu/concat_kernel.cc         |  2 +-
 paddle/pten/kernels/gpu/concat_kernel.cu         |  2 +-
 paddle/pten/tests/api/test_concat_api.cc         |  6 ++++--
 paddle/pten/tests/kernels/test_concat_dev_api.cc | 16 +++++++---------
 5 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index 3eaffbdc8bf35..1d9c10bdb8cc6 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -80,7 +80,10 @@ class ConcatKernel : public framework::OpKernel<T> {
       pt_ins.push_back(*in);
     }
 
-    pten::ConcatKernel<T>(dev_ctx, pt_ins, axis, out);
+    pten::ConcatKernel<T>(
+        static_cast<const typename paddle::framework::ConvertToPtenContext<
+            DeviceContext>::TYPE&>(dev_ctx),
+        pt_ins, axis, out);
   }
 };
 
diff --git a/paddle/pten/kernels/cpu/concat_kernel.cc b/paddle/pten/kernels/cpu/concat_kernel.cc
index fb59c9c6005ff..c4aed7679bd72 100644
--- a/paddle/pten/kernels/cpu/concat_kernel.cc
+++ b/paddle/pten/kernels/cpu/concat_kernel.cc
@@ -43,7 +43,7 @@ void ConcatKernel(const Context& dev_ctx,
 
   pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis);
   out->Resize(out_dims);
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
 
   // If axis is 0, the lod of the output is not the same as inputs.
   if (axis == 0 && x[0].lod().size() > 0) {
diff --git a/paddle/pten/kernels/gpu/concat_kernel.cu b/paddle/pten/kernels/gpu/concat_kernel.cu
index 6ddfef460fc6c..e52e3a3d6446c 100644
--- a/paddle/pten/kernels/gpu/concat_kernel.cu
+++ b/paddle/pten/kernels/gpu/concat_kernel.cu
@@ -43,7 +43,7 @@ void ConcatKernel(const Context& dev_ctx,
 
   pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis);
   out->Resize(out_dims);
-  out->mutable_data<T>();
+  out->mutable_data<T>(dev_ctx.GetPlace());
 
   // If axis is 0, the lod of the output is not the same as inputs.
   if (axis == 0 && x[0].lod().size() > 0) {
diff --git a/paddle/pten/tests/api/test_concat_api.cc b/paddle/pten/tests/api/test_concat_api.cc
index e84aee0aaaf4f..c003e89f6c009 100644
--- a/paddle/pten/tests/api/test_concat_api.cc
+++ b/paddle/pten/tests/api/test_concat_api.cc
@@ -37,14 +37,16 @@ TEST(API, concat) {
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x->mutable_data<float>();
+  auto* dense_x_data =
+      dense_x->mutable_data<float>(paddle::platform::CPUPlace());
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
       alloc.get(),
       pten::DenseTensorMeta(pten::DataType::FLOAT32,
                             framework::make_ddim({3, 10}),
                             pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y->mutable_data<float>();
+  auto* dense_y_data =
+      dense_y->mutable_data<float>(paddle::platform::CPUPlace());
 
   for (size_t i = 0; i < 3; ++i) {
     for (size_t j = 0; j < 10; ++j) {
diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc
index c5d979ad908ff..6f9ea1b0d990a 100644
--- a/paddle/pten/tests/kernels/test_concat_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc
@@ -25,7 +25,7 @@ namespace pten {
 namespace tests {
 
 namespace framework = paddle::framework;
-using DDim = paddle::framework::DDim;
+using DDim = pten::framework::DDim;
 
 TEST(DEV_API, concat) {
   // 1. create tensor
@@ -35,13 +35,15 @@ TEST(DEV_API, concat) {
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_x_data = dense_x.mutable_data<float>();
+  auto* dense_x_data =
+      dense_x.mutable_data<float>(paddle::platform::CPUPlace());
 
   pten::DenseTensor dense_y(alloc.get(),
                             pten::DenseTensorMeta(pten::DataType::FLOAT32,
                                                   framework::make_ddim({3, 10}),
                                                   pten::DataLayout::NCHW));
-  auto* dense_y_data = dense_y.mutable_data<float>();
+  auto* dense_y_data =
+      dense_y.mutable_data<float>(paddle::platform::CPUPlace());
 
   for (size_t i = 0; i < 3; ++i) {
     for (size_t j = 0; j < 10; ++j) {
@@ -50,15 +52,11 @@ TEST(DEV_API, concat) {
     }
   }
 
-  paddle::platform::DeviceContextPool& pool =
-      paddle::platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
-
   std::vector<pten::DenseTensor> inputs = {dense_x, dense_y};
 
   // 2. test API
-  auto out = pten::Concat<float>(
-      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)), inputs, 0);
+  pten::CPUContext dev_ctx;
+  auto out = pten::Concat<float>(dev_ctx, inputs, 0);
 
   // 3. check result
   ASSERT_EQ(out.dims().size(), 2);