From e5cda6faa08b884e6bf5ae8b13da7691a364d5e6 Mon Sep 17 00:00:00 2001 From: Yulong Ao Date: Fri, 21 Jan 2022 11:56:32 +0800 Subject: [PATCH 01/15] [Auto Parallel] Use the new completion algorithm (#39086) * Add the backward support for QR * Remove unnecessary comments * [Auto Parallel] Improve the dist op interface and compatible computation * Remove unnecessary modification * Recover some modifications * Add lost files * Fix a minor bug * Fix the bug of the planner * Fix the format problem * [Auto Parallel] Update the completion algorithm * Fix the bug of auto_searcher unittest --- .../distributed/auto_parallel/__init__.py | 6 - .../distributed/auto_parallel/completion.py | 1414 +++++++---------- .../distributed/auto_parallel/dist_context.py | 34 +- .../distributed/auto_parallel/parallelizer.py | 15 +- .../test_auto_parallel_completion.py | 66 +- .../test_auto_parallel_completion_gpt.py | 22 +- .../test_auto_parallel_cost_model.py | 6 +- .../test_auto_parallel_dist_tensor.py | 6 +- .../unittests/test_auto_parallel_mapper.py | 12 +- .../test_auto_parallel_partitioner.py | 6 +- .../test_auto_parallel_partitioner_gpt.py | 11 +- .../unittests/test_auto_parallel_reshard.py | 7 +- .../test_auto_parallel_reshard_dpmppp.py | 6 +- .../test_auto_parallel_reshard_mppp.py | 11 +- .../unittests/test_auto_parallel_searcher.py | 4 +- 15 files changed, 686 insertions(+), 940 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 3b5ccaa062f6e..edcd53bdc7a52 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -15,12 +15,6 @@ from .interface import shard_tensor # noqa: F401 from .interface import shard_op # noqa: F401 from .process_mesh import ProcessMesh -# from .interface import set_shard_mask # noqa: F401 -# from .interface import set_offload_device # noqa: F401 -# from .interface import set_pipeline_stage # noqa: F401 -# 
from .interface import ProcessMesh # noqa: F401 -from .completion import complete_annotation # noqa: F401 -from .completion import complete_backward_annotation # noqa: F401 from .reshard import reshard # noqa: F401 from .cost_model import estimate_cost diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 660b1a54221a7..54491f9e6c16e 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy from copy import deepcopy +import time from paddle.fluid import core from paddle.fluid import framework -from .utils import compute_compatible_process_mesh -from .utils import compute_compatible_dim_mapping -from .utils import compute_compatible_dims_mapping from .utils import print_program_with_dist_attr from .operators import find_best_compatible_distributed_operator_impl from .dist_context import get_default_distributed_context @@ -29,865 +28,602 @@ from .dist_attribute import OperatorDistributedAttribute from paddle.distributed.fleet.meta_optimizers.common import OpRole -ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"] +def compute_compatible_process_mesh(process_mesh_list): + """Compute the compatible process mesh given a list of process meshes.""" + if not process_mesh_list: + return None -def is_elementwise_like_op(op_type): - if op_type in ELEMENTWISE_LIKE_OP_LIST: - return True - else: - return False - + def _compute_compatible_process_mesh_two(pm1, pm2): + if pm1 is None: + return True, pm2 + if pm2 is None: + return True, pm1 + if pm1 == pm2: + return True, pm1 + if pm1.processes == pm2.processes: + if len(pm1.topology) >= len(pm2.topology): + return True, pm1 + else: + return True, pm2 + process_set1 = set(pm1.processes) + process_set2 = set(pm2.processes) + if 
process_set1.issubset(process_set2): + return True, pm2 + if process_set2.issubset(process_set1): + return True, pm1 + return False, None + + compatible_result = None + for process_mesh in process_mesh_list: + compatible, compatible_result = _compute_compatible_process_mesh_two( + compatible_result, process_mesh) + if not compatible: + return None + return copy.deepcopy(compatible_result) + + +def compute_compatible_dim_mapping(dim_mapping_list): + """Compute the compatible dim mapping given a list of dim mapping.""" + if not dim_mapping_list: + return None -def update_tensor_node_process_mesh(dist_context, tensor_node, fwd=True): - """ - Update tensor's process mesh by using its predecessor's process mesh if in the forward direction, - and by using its successor's process mesh if in the backward direction. Note: only the equal - process meshes are compatible for now. + def _compute_compatible_dim_mapping_two(dm1, dm2): + if dm1 == -1: + return True, dm2 + if dm2 == -1: + return True, dm1 + if dm1 == dm2: + return True, dm1 + return False, None + + compatible_result = -1 + for mapping in dim_mapping_list: + compatible, compatible_result = _compute_compatible_dim_mapping_two( + compatible_result, mapping) + if not compatible: + return None + return compatible_result + + +def compute_compatible_dims_mapping(dims_mapping_list): + """Compute the compatible dims mapping given a list of dims mapping. + Each of dims mapping is also a list. 
""" - changed = False - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(tensor_node) - if tensor_dist_attr.is_annotated("process_mesh"): - return changed - tensor_process_mesh = tensor_dist_attr.process_mesh - if fwd: - inputs_process_meshes = [] - for pred_op_node in tensor_node.inputs: - if pred_op_node.op() is not None: - op_dist_attr = dist_context.get_op_dist_attr_for_graph( - pred_op_node) - op_process_mesh = op_dist_attr.process_mesh - inputs_process_meshes.append(op_process_mesh) - compatible_process_mesh = compute_compatible_process_mesh( - inputs_process_meshes) - if compatible_process_mesh is not None and tensor_process_mesh is None: - tensor_dist_attr.process_mesh = compatible_process_mesh - changed = True - else: - outputs_process_meshes = [] - for succ_op_node in tensor_node.outputs: - if succ_op_node.op() is not None: - op_dist_attr = dist_context.get_op_dist_attr_for_graph( - succ_op_node) - op_process_mesh = op_dist_attr.process_mesh - outputs_process_meshes.append(op_process_mesh) - compatible_process_mesh = compute_compatible_process_mesh( - outputs_process_meshes) - if compatible_process_mesh is not None and tensor_process_mesh is None: - tensor_dist_attr.process_mesh = compatible_process_mesh - changed = True - return changed - - -def update_op_node_process_mesh(dist_context, op_node, fwd=True): - """ - Update op's process mesh by using its predecessor's process mesh if in the forward direction, - and by using its successor's process mesh if in the backward direction. Note: only the equal - process meshes are compatible for now. 
- """ - changed = False - op_dist_attr = dist_context.get_op_dist_attr_for_graph(op_node) - if op_dist_attr.is_annotated("process_mesh"): - return changed - op_process_mesh = op_dist_attr.process_mesh - if fwd: - inputs_process_meshes = [] - for tensor_node in op_node.inputs: - if tensor_node.var() is not None: - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph( - tensor_node) - tensor_process_mesh = tensor_dist_attr.process_mesh - inputs_process_meshes.append(tensor_process_mesh) - compatible_process_mesh = compute_compatible_process_mesh( - inputs_process_meshes) - if compatible_process_mesh is not None and op_process_mesh is None: - op_dist_attr.process_mesh = compatible_process_mesh - changed = True - else: - outputs_process_meshes = [] - for tensor_node in op_node.outputs: - if tensor_node.var() is not None: - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph( - tensor_node) - tensor_process_mesh = tensor_dist_attr.process_mesh - outputs_process_meshes.append(tensor_process_mesh) - compatible_process_mesh = compute_compatible_process_mesh( - outputs_process_meshes) - if compatible_process_mesh is not None and op_process_mesh is None: - op_dist_attr.process_mesh = compatible_process_mesh - changed = True - return changed - - -def update_op_dims_mapping_by_default_dist_impl(dist_context, op_node): - """Each operator has a default distributed operator, only allowed to be sharded in batch dimension.""" - changed = False - if (not op_node.is_op()) or (op_node.op() is None): - return False - op_desc = op_node.op() - dist_op = dist_context.get_dist_op_for_graph(op_node) - op_dist_attr = dist_op.dist_attr - # The following statement will be replaced by a more elegent way - if op_desc.type() == "shape" or op_desc.type() == "slice": - return False - output_names = op_desc.output_names() - xshape_arg_names = [] - if "XShape" in output_names: - xshape_arg_names = op_desc.output("XShape") - batch_dim_mappings = [] - for arg_name in 
op_desc.input_arg_names(): - serial_tensor = dist_op.get_serial_input(arg_name) - if serial_tensor.is_parameter: - continue - dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if len(dims_mapping) > 1: - for idx, mapping in enumerate(dims_mapping[1:]): - assert mapping == -1, \ - "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part."\ - .format(op_desc.type(), idx, mapping) - batch_dim_mappings.append(dims_mapping[0]) - for arg_name in op_desc.output_arg_names(): - serial_tensor = dist_op.get_serial_output(arg_name) - if serial_tensor.is_parameter: - continue - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if arg_name not in xshape_arg_names: - if len(dims_mapping) > 1: - for idx, mapping in enumerate(dims_mapping[1:]): - assert mapping == -1, \ - "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part."\ - .format(op_desc.type(), idx, mapping) - batch_dim_mappings.append(dims_mapping[0]) - else: - assert dims_mapping[0] == -1, \ - "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension 0 is sharded by {} part."\ - .format(op_desc.type(), mapping) - if len(dims_mapping) > 2: - for idx, mapping in enumerate(dims_mapping[2:]): - assert mapping == -1, \ - "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension {} is sharded by {} part."\ - .format(op_desc.type(), idx, mapping) - batch_dim_mappings.append(dims_mapping[1]) - - compatible_dim_mapping = compute_compatible_dim_mapping(batch_dim_mappings) - assert compatible_dim_mapping is not None, "There is no compatible dim mapping." 
- for arg_name in op_desc.input_arg_names(): - serial_tensor = dist_op.get_serial_input(arg_name) - if serial_tensor.is_parameter: - continue - dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if compatible_dim_mapping != dims_mapping[0]: - dims_mapping[0] = compatible_dim_mapping - changed = True - for arg_name in op_desc.output_arg_names(): - serial_tensor = dist_op.get_serial_output(arg_name) - if serial_tensor.is_parameter: - continue - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if arg_name not in xshape_arg_names: - if compatible_dim_mapping != dims_mapping[0]: - dims_mapping[0] = compatible_dim_mapping + if not dims_mapping_list: + return None + length = len(dims_mapping_list[0]) + for dims_mapping in dims_mapping_list: + if dims_mapping is None: + return None + if len(dims_mapping) != length: + return None + compatible_result = [] + for dim_mappings in zip(*dims_mapping_list): + compatible_dim_mapping = compute_compatible_dim_mapping( + list(dim_mappings)) + if compatible_dim_mapping is None: + return None + compatible_result.append(compatible_dim_mapping) + return compatible_result + + +class Completer: + def __init__(self, dist_context): + assert dist_context is not None + self._dist_context = dist_context + + def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True): + changed = False + if (not tensor_node.is_var()) or (tensor_node.var() is None): + return False + tensor_desc = tensor_node.var() + # Skip reader tensor + if tensor_desc.type() == core.VarDesc.VarType.READER: + return False + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + assert tensor_dist_attr is not None + if tensor_dist_attr.is_annotated("dims_mapping"): + return False + tensor_dims_mapping = tensor_dist_attr.dims_mapping + if fwd: + dims_mapping_list = [] + for pred_op_node in tensor_node.inputs: + if pred_op_node.op() is not None: + if pred_op_node.op().type() == "create_py_reader" \ + or 
pred_op_node.op().type() == "create_double_buffer_reader" \ + or pred_op_node.op().type() == "read": + continue + op_dist_attr = self._dist_context.get_op_dist_attr_for_graph( + pred_op_node) + if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh: + op_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_desc.name()) + dims_mapping_list.append(op_dims_mapping) + dims_mapping_list.append(tensor_dims_mapping) + compatible_dims_mapping = compute_compatible_dims_mapping( + dims_mapping_list) + if (compatible_dims_mapping is not None) and \ + (compatible_dims_mapping != tensor_dims_mapping): + tensor_dist_attr.dims_mapping = compatible_dims_mapping changed = True else: - if compatible_dim_mapping != dims_mapping[1]: - dims_mapping[1] = compatible_dim_mapping + dims_mapping_list = [] + for succ_op_node in tensor_node.outputs: + if succ_op_node.op() is not None: + if succ_op_node.op().type() == "create_py_reader" \ + or succ_op_node.op().type() == "create_double_buffer_reader" \ + or succ_op_node.op().type() == "read": + continue + op_dist_attr = self._dist_context.get_op_dist_attr_for_graph( + succ_op_node) + if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh: + op_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_desc.name()) + dims_mapping_list.append(op_dims_mapping) + dims_mapping_list.append(tensor_dims_mapping) + compatible_dims_mapping = compute_compatible_dims_mapping( + dims_mapping_list) + if (compatible_dims_mapping is not None) and \ + (compatible_dims_mapping != tensor_dims_mapping): + tensor_dist_attr.dims_mapping = compatible_dims_mapping changed = True + return changed - return changed - - -def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_context, op_node): - """Element-wise operator can be sharded in any way (but should take care of broadcasting).""" - changed = False - if (not op_node.is_op()) or (op_node.op() is None): - return False - op_desc = op_node.op() - op_dist_attr = 
dist_context.get_op_dist_attr_for_graph(op_node) - - input_arg_names = op_desc.input_arg_names() - input_dims_mapping_dict = {} - input_dims_mapping_lens = {} - max_dims_mapping_len = -1 - for arg_name in input_arg_names: - dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) - if max_dims_mapping_len < len(dims_mapping): - max_dims_mapping_len = len(dims_mapping) - input_dims_mapping_dict[arg_name] = dims_mapping - input_dims_mapping_lens[arg_name] = len(dims_mapping) - - dims_mapping_list = [] - for arg_name in input_arg_names: - if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: - new_dims_mapping = [-1 for _ in range(max_dims_mapping_len)] - for i in range(input_dims_mapping_lens[arg_name]): - new_idx = (max_dims_mapping_len - - input_dims_mapping_lens[arg_name]) + i - new_dims_mapping[new_idx] = input_dims_mapping_dict[arg_name][i] - dims_mapping_list.append(new_dims_mapping) - else: - dims_mapping_list.append(input_dims_mapping_dict[arg_name]) - output_arg_names = op_desc.output_arg_names() - for arg_name in output_arg_names: - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - assert len(dims_mapping) == max_dims_mapping_len - dims_mapping_list.append(dims_mapping) - - compatible_dims_mapping = compute_compatible_dims_mapping(dims_mapping_list) - assert compatible_dims_mapping is not None, "There is no compatible dim mapping." 
- - for arg_name in input_arg_names: - if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: - new_dims_mapping = [ - -1 for _ in range(input_dims_mapping_lens[arg_name]) - ] - for i in range(input_dims_mapping_lens[arg_name]): - new_idx = (max_dims_mapping_len - - input_dims_mapping_lens[arg_name]) + i - new_dims_mapping[i] = compatible_dims_mapping[new_idx] - if new_dims_mapping != input_dims_mapping_dict[arg_name]: - op_dist_attr.set_input_dims_mapping(arg_name, new_dims_mapping) + def _update_op_node_dims_mapping(self, op_node, fwd=True): + changed = False + if (not op_node.is_op()) or (op_node.op() is None): + return False + # Skip reader op + op_desc = op_node.op() + if op_desc.type() == "create_py_reader" \ + or op_desc.type() == "create_double_buffer_reader" \ + or op_desc.type() == "read": + return False + dist_op = self._dist_context.get_dist_op_for_graph(op_node) + op_dist_attr = dist_op.dist_attr + if fwd: + for tensor_node in op_node.inputs: + if tensor_node.var() is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER: + continue + tensor_desc = tensor_node.var() + if op_dist_attr.is_annotated_input_dims_mapping( + tensor_desc.name()): + continue + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh: + tensor_dims_mapping = tensor_dist_attr.dims_mapping + op_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_desc.name()) + compatible_dims_mapping = compute_compatible_dims_mapping( + [op_dims_mapping, tensor_dims_mapping]) + if (compatible_dims_mapping is not None) and \ + (compatible_dims_mapping != op_dims_mapping): + op_dist_attr.set_input_dims_mapping( + tensor_desc.name(), compatible_dims_mapping) + changed = True + # Find the most compatible implemenetations from the distributed operator + op_dist_impl = find_best_compatible_distributed_operator_impl( + dist_op, fwd=True) + assert op_dist_impl is not None, "Cannot 
find the dist op implementation." + dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx else: - if compatible_dims_mapping != input_dims_mapping_dict[arg_name]: - op_dist_attr.set_input_dims_mapping(arg_name, - compatible_dims_mapping) + for tensor_node in op_node.outputs: + if tensor_node.var() is not None: + if tensor_node.var().type() == core.VarDesc.VarType.READER: + continue + tensor_desc = tensor_node.var() + if op_dist_attr.is_annotated_output_dims_mapping( + tensor_desc.name()): + continue + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + if op_dist_attr.process_mesh == tensor_dist_attr.process_mesh: + tensor_dims_mapping = tensor_dist_attr.dims_mapping + op_dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_desc.name()) + compatible_dims_mapping = compute_compatible_dims_mapping( + [op_dims_mapping, tensor_dims_mapping]) + if (compatible_dims_mapping is not None) and \ + (compatible_dims_mapping != op_dims_mapping): + op_dist_attr.set_output_dims_mapping( + tensor_desc.name(), compatible_dims_mapping) + changed = True + # Find the most compatible implemenetations from the distributed operator + op_dist_impl = find_best_compatible_distributed_operator_impl( + dist_op, fwd=False) + assert op_dist_impl is not None, "Cannot find the dist op implementation." 
+ dim_changed = op_dist_impl.update_dims_mapping(dist_op) + if dim_changed: changed = True + if op_dist_impl.is_auto_compatible(dist_op): + if op_dist_impl.type == "elementwise": + op_dist_attr.impl_type = "default" + else: + op_dist_attr.impl_type = op_dist_impl.type + op_dist_attr.impl_idx = op_dist_impl.idx + return changed - for arg_name in output_arg_names: - dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) - if compatible_dims_mapping != dims_mapping: - op_dist_attr.set_output_dims_mapping(arg_name, - compatible_dims_mapping) - changed = True - - return changed - - -def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True): - changed = False - if (not tensor_node.is_var()) or (tensor_node.var() is None): - return False - tensor_desc = tensor_node.var() - # Skip reader tensor - if tensor_desc.type() == core.VarDesc.VarType.READER: - return False - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph(tensor_node) - assert tensor_dist_attr is not None - if tensor_dist_attr.is_annotated("dims_mapping"): - return False - tensor_dims_mapping = tensor_dist_attr.dims_mapping - if fwd: - dims_mapping_list = [] - for pred_op_node in tensor_node.inputs: - if pred_op_node.op() is not None: - if pred_op_node.op().type() == "create_py_reader" \ - or pred_op_node.op().type() == "create_double_buffer_reader" \ - or pred_op_node.op().type() == "read": - continue - op_dist_attr = dist_context.get_op_dist_attr_for_graph( - pred_op_node) - op_dims_mapping = op_dist_attr.get_output_dims_mapping( - tensor_desc.name()) - dims_mapping_list.append(op_dims_mapping) - dims_mapping_list.append(tensor_dims_mapping) - compatible_dims_mapping = compute_compatible_dims_mapping( - dims_mapping_list) - if (compatible_dims_mapping is not None) and \ - (compatible_dims_mapping != tensor_dims_mapping): - tensor_dist_attr.dims_mapping = compatible_dims_mapping - changed = True - else: - dims_mapping_list = [] - for succ_op_node in tensor_node.outputs: - if 
succ_op_node.op() is not None: - if succ_op_node.op().type() == "create_py_reader" \ - or succ_op_node.op().type() == "create_double_buffer_reader" \ - or succ_op_node.op().type() == "read": - continue - op_dist_attr = dist_context.get_op_dist_attr_for_graph( - succ_op_node) - op_dims_mapping = op_dist_attr.get_input_dims_mapping( - tensor_desc.name()) - dims_mapping_list.append(op_dims_mapping) - dims_mapping_list.append(tensor_dims_mapping) - compatible_dims_mapping = compute_compatible_dims_mapping( - dims_mapping_list) - if (compatible_dims_mapping is not None) and \ - (compatible_dims_mapping != tensor_dims_mapping): - tensor_dist_attr.dims_mapping = compatible_dims_mapping - changed = True - return changed - - -def update_op_node_dims_mapping(dist_context, op_node, fwd=True): - changed = False - if (not op_node.is_op()) or (op_node.op() is None): - return False - # Skip reader op - op_desc = op_node.op() - if op_desc.type() == "create_py_reader" \ - or op_desc.type() == "create_double_buffer_reader" \ - or op_desc.type() == "read": - return False - dist_op = dist_context.get_dist_op_for_graph(op_node) - op_dist_attr = dist_op.dist_attr - if fwd: - for tensor_node in op_node.inputs: - if tensor_node.var() is not None: - if tensor_node.var().type() == core.VarDesc.VarType.READER: - continue - tensor_desc = tensor_node.var() - if op_dist_attr.is_annotated_input_dims_mapping( - tensor_desc.name()): - continue - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph( - tensor_node) - tensor_dims_mapping = tensor_dist_attr.dims_mapping - op_dims_mapping = op_dist_attr.get_input_dims_mapping( - tensor_desc.name()) - compatible_dims_mapping = compute_compatible_dims_mapping( - [op_dims_mapping, tensor_dims_mapping]) - if (compatible_dims_mapping is not None) and \ - (compatible_dims_mapping != op_dims_mapping): - op_dist_attr.set_input_dims_mapping(tensor_desc.name(), - compatible_dims_mapping) - changed = True - # Find the most compatible implemenetations 
from the distributed operator - op_dist_impl = find_best_compatible_distributed_operator_impl( - dist_op, fwd=True) - assert op_dist_impl is not None, "Cannot find the dist op implementation." - dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" - else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx - else: - for tensor_node in op_node.outputs: - if tensor_node.var() is not None: - if tensor_node.var().type() == core.VarDesc.VarType.READER: - continue - tensor_desc = tensor_node.var() - if op_dist_attr.is_annotated_output_dims_mapping( - tensor_desc.name()): - continue - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph( - tensor_node) - tensor_dims_mapping = tensor_dist_attr.dims_mapping - op_dims_mapping = op_dist_attr.get_output_dims_mapping( - tensor_desc.name()) - compatible_dims_mapping = compute_compatible_dims_mapping( - [op_dims_mapping, tensor_dims_mapping]) - if (compatible_dims_mapping is not None) and \ - (compatible_dims_mapping != op_dims_mapping): - op_dist_attr.set_output_dims_mapping( - tensor_desc.name(), compatible_dims_mapping) - changed = True - # Find the most compatible implemenetations from the distributed operator - op_dist_impl = find_best_compatible_distributed_operator_impl( - dist_op, fwd=False) - assert op_dist_impl is not None, "Cannot find the dist op implementation." 
- dim_changed = op_dist_impl.update_dims_mapping(dist_op) - if dim_changed: - changed = True - if op_dist_impl.is_auto_compatible(dist_op): - if op_dist_impl.type == "elementwise": - op_dist_attr.impl_type = "default" + def _update_process_mesh(self): + def _find_nearset_node(nodes, idx): + for node in reversed(nodes[:idx]): + node_dist_attr = self._dist_context.get_dist_attr_for_graph( + node) + if node_dist_attr.process_mesh is not None: + return node + + total_reach_fix_point = False + while not total_reach_fix_point: + total_changed = False + for is_fwd in [True, False]: + all_nodes = self._dist_context.serial_ordered_nodes \ + if is_fwd else reversed(self._dist_context.serial_ordered_nodes) + reach_fix_point = False + while not reach_fix_point: + changed = False + for idx, node in enumerate(all_nodes): + nearest_node = _find_nearset_node( + self._dist_context.serial_ordered_nodes, idx) + if nearest_node is None: + continue + nearest_node_dis_attr = self._dist_context.get_dist_attr_for_graph( + nearest_node) + nearest_process_mesh = nearest_node_dis_attr.process_mesh + cur_node_dist_attr = self._dist_context.get_dist_attr_for_graph( + node) + cur_process_mesh = cur_node_dist_attr.process_mesh + compatible_process_mesh = compute_compatible_process_mesh( + [cur_process_mesh, nearest_process_mesh]) + if compatible_process_mesh is not None \ + and cur_process_mesh != compatible_process_mesh: + cur_node_dist_attr.process_mesh = compatible_process_mesh + changed = True + if changed: + reach_fix_point = False + total_changed = True + else: + reach_fix_point = True + if total_changed: + total_reach_fix_point = False else: - op_dist_attr.impl_type = op_dist_impl.type - op_dist_attr.impl_idx = op_dist_impl.idx - return changed - - -def complete_annotation(program, dist_context=None): - """ Complete annotation for the partial annotated program. - - Arguments: - program: partial annotated program. 
- dist_context: the distributed context is used to store distributed attributes for program. - If not provided, the default one will be used. - Returns: - program: completed annotated program. - """ - - # Use the default distribted context for completeion if there is no one - if dist_context is None: - dist_context = get_default_distributed_context() - dist_context.serial_program = program - else: - dist_context.serial_program = program - - # print_program_with_dist_attr(program, dist_context) - - # Initialize distributed attributes for all var and op node in program - dist_context.init_dist_attr_for_program() - - # Initialize distributed attributes for all var and op node in graph - dist_context.init_dist_attr_for_graph() - - # Complete process mesh for each node - all_nodes = list(dist_context.serial_graph.all_nodes()) + total_reach_fix_point = True - def sort_key_fun(node): - first = -1 - if node.is_op(): - first = 0 - else: - first = 1 - second = -1 - if node.is_op() and node.op() is not None: - second = node.op().id() - if node.is_var() and node.var() is not None: - second = node.var().id() - return (first, second) - - all_nodes.sort(key=sort_key_fun) - - reach_fix_point = False - while not reach_fix_point: - total_changed = False - reach_fwd_fix_point = False - reach_bwd_fix_point = False - while not reach_fwd_fix_point: + def _update_dims_mapping(self): + # Complete dims_mapping for each node + reach_fix_point = False + while not reach_fix_point: changed = False - for node in all_nodes: - if node.is_var() and node.var() is not None: - tensor_changed = update_tensor_node_process_mesh( - dist_context, node, fwd=True) - if tensor_changed: - changed = True - if node.is_op() and node.op() is not None: - op_changed = update_op_node_process_mesh( - dist_context, node, fwd=True) - if op_changed: - changed = True + for is_fwd in [True, False]: + all_nodes = self._dist_context.serial_ordered_nodes \ + if is_fwd else reversed(self._dist_context.serial_ordered_nodes) + 
for node in all_nodes: + if node.is_var() and node.var() is not None: + tensor_changed = self._update_tensor_node_dims_mapping( + node, fwd=is_fwd) + if tensor_changed: + changed = True + if node.is_op() and node.op() is not None: + op_changed = self._update_op_node_dims_mapping( + node, fwd=is_fwd) + if op_changed: + changed = True if changed: - reach_fwd_fix_point = False - total_changed = True + reach_fix_point = False else: - reach_fwd_fix_point = True - while not reach_bwd_fix_point: - changed = False - for node in all_nodes: - if node.is_var() and node.var() is not None: - tensor_changed = update_tensor_node_process_mesh( - dist_context, node, fwd=False) - if tensor_changed: - changed = True - if node.is_op() and node.op() is not None: - op_changed = update_op_node_process_mesh( - dist_context, node, fwd=False) - if op_changed: - changed = True - if changed: - reach_bwd_fix_point = False - total_changed = True - else: - reach_bwd_fix_point = True - if total_changed: - reach_fix_point = False - else: - reach_fix_point = True - # Validation the completion of process meshes and should be moved to a proper location - is_wrong = False - for node in all_nodes: - if node.is_var() and node.var() is not None: - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph( - node) - if tensor_dist_attr.process_mesh is None: - msg_str = "" - for op_node in node.inputs: - if op_node.op() is not None: - op_dist_attr = dist_context.get_op_dist_attr_for_graph( - op_node) - msg_str += "{} [{}], ".format( - op_node.op().type(), - op_dist_attr.process_mesh) - else: - msg_str += "{} [{}], ".format(op_node.name(), - None) - for op_node in node.outputs: - if op_node.op() is not None: - op_dist_attr = dist_context.get_op_dist_attr_for_graph( - op_node) - msg_str += "{} [{}], ".format( - op_node.op().type(), - op_dist_attr.process_mesh) - else: - msg_str += "{} [{}], ".format(op_node.name(), - None) - msg_str = "Cannot decide ProcessMesh of {} among {}. 
Please use shard_tensor api explicitly to annotate it".format( - node.var().name(), msg_str[:-2]) - is_wrong = True - print(msg_str) - if node.is_op() and node.op() is not None: - op_dist_attr = dist_context.get_op_dist_attr_for_graph(node) - if op_dist_attr.process_mesh is None: - msg_str = "" - for tensor_node in node.inputs: - if tensor_node.var() is not None: - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph( - tensor_node) - msg_str += "{} [{}], ".format( - tensor_node.var().name(), - tensor_dist_attr.process_mesh) - else: - msg_str += "{} [{}], ".format( - tensor_node.name(), None) - for tensor_node in node.outputs: - if tensor_node.var() is not None: - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_graph( - tensor_node) - msg_str += "{} [{}], ".format( - tensor_node.var().name(), - tensor_dist_attr.process_mesh) - else: - msg_str += "{} [{}], ".format( - tensor_node.name(), None) - msg_str = "Cannot decide ProcessMesh of {} among {}. Please use shard_op api explicitly to annotate it".format( - node.op().type(), msg_str[:-2]) - is_wrong = True - print(msg_str) - if node.is_op() and node.op() is None: - print("op op is None", node.name()) - if is_wrong: - assert False, "Cannot complete process_meshes of the program." 
- - # Complete dims_mapping for each node - reach_fix_point = False - while not reach_fix_point: - changed = False - for node in all_nodes: - if node.is_var() and node.var() is not None: - tensor_changed = update_tensor_node_dims_mapping( - dist_context, node, fwd=True) - if tensor_changed: - changed = True - if node.is_op() and node.op() is not None: - op_changed = update_op_node_dims_mapping( - dist_context, node, fwd=True) - if op_changed: - changed = True - for node in reversed(all_nodes): - if node.is_var() and node.var() is not None: - tensor_changed = update_tensor_node_dims_mapping( - dist_context, node, fwd=False) - if tensor_changed: - changed = True - if node.is_op() and node.op() is not None: - op_changed = update_op_node_dims_mapping( - dist_context, node, fwd=False) - if op_changed: - changed = True - if changed: - reach_fix_point = False - else: - reach_fix_point = True - - # Copy the corresponding distributed attribute from graph to program - dist_context.copy_dist_attr_from_graph_to_program() - dist_context.clear_dist_info_for_graph() - - # Do the validation check and amend some completion - dist_context.amend_dist_attr_for_program() - - # print_program_with_dist_attr(program, dist_context) - dist_context.validate_dist_attr_for_program() + reach_fix_point = True + + def complete_forward_annotation(self, serial_main_program): + """ Complete annotation for the partial annotated serial_main_program. + + Arguments: + serial_main_program: partial annotated serial_main_program. + + Returns: + serial_main_program: completed annotated serial_main_program. 
+ """ + + # Use the default distributed context for completion if none is provided + self._dist_context.serial_program = serial_main_program + + # Initialize distributed attributes for all var and op node in serial_main_program + self._dist_context.init_dist_attr_for_program() + + # Initialize distributed attributes for all var and op node in graph + self._dist_context.init_dist_attr_for_graph() + + self._update_process_mesh() + + # Complete dims_mapping for each node + self._update_dims_mapping() + + # Copy the corresponding distributed attribute from graph to serial_main_program + self._dist_context.copy_dist_attr_from_graph_to_program() + self._dist_context.clear_dist_info_for_graph() + + # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) + # Do the validation check and amend some completion + self._dist_context.amend_dist_attr_for_program() + + # print_serial_main_program_with_dist_attr(serial_main_program, self._dist_context) + self._dist_context.validate_dist_attr_for_program() + + return serial_main_program + + def complete_backward_annotation(self, serial_main_program): + """Complete the annotation of vars and ops in the backward phase for parallel program.""" + + def _is_grad_var_name(name): + if "@GRAD" in name: + return True + return False + + def _get_forward_varname_from_grad_varname(grad_var_name): + assert _is_grad_var_name( + grad_var_name), "[{}] is not a grad varnme.".format( + grad_var_name) + return grad_var_name[:grad_var_name.find("@GRAD")] + + def _get_op_by_id(ops, id): + for op in ops: + if op.desc.id() == id: + return op + return None + + first_backward_op_idx = -1 + for idx, op in enumerate(serial_main_program.global_block().ops): + if int(op.attr('op_role')) == int( + int(core.op_proto_and_checker_maker.OpRole.Backward) | int( + core.op_proto_and_checker_maker.OpRole.Loss)): + assert op.type == "fill_constant" + first_backward_op_idx = idx + break + + assert first_backward_op_idx >= 0, "No backward
procedure found in this program." + + ops = list(serial_main_program.global_block().ops) + vars = serial_main_program.global_block().vars + dist_op_context = self._dist_context.dist_op_context + + for idx in range(first_backward_op_idx, len(ops)): + + # complete the initial grad loss op + if idx == first_backward_op_idx: + assert ops[idx].type == "fill_constant" + assert len( + ops[idx].input_arg_names + ) == 0, "first backward op should has only ONE output, but got [{}]".format( + len(ops[idx].input_arg_names)) + assert len( + ops[idx].output_arg_names + ) == 1, "first backward op should has only ONE output, but got [{}]".format( + len(ops[idx].output_arg_names)) + + grad_var = vars[ops[idx].output_arg_names[0]] + forward_var_name = _get_forward_varname_from_grad_varname( + grad_var.name) + forward_var = vars[forward_var_name] + + # TODO complete other attributes for grad var + tensor_dist_attr = TensorDistributedAttribute() + process_mesh = self._dist_context.get_tensor_dist_attr_for_program( + forward_var).process_mesh + dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( + forward_var).dims_mapping + tensor_dist_attr.dims_mapping = dims_mapping + tensor_dist_attr.process_mesh = process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + grad_var, tensor_dist_attr) - return program - - -def complete_backward_annotation(auto_parallel_main_prog, dist_context=None): - """Complete the annotation of vars and ops in the backward phase for parallel program.""" - - def _is_grad_var_name(name): - if "@GRAD" in name: - return True - return False - - def _get_forward_varname_from_grad_varname(grad_var_name): - assert _is_grad_var_name( - grad_var_name), "[{}] is not a grad varnme.".format(grad_var_name) - return grad_var_name[:grad_var_name.find("@GRAD")] - - def _get_op_by_id(ops, id): - for op in ops: - if op.desc.id() == id: - return op - return None + op_dist_attr = OperatorDistributedAttribute() + op_dist_attr.process_mesh = process_mesh +
op_dist_attr.set_output_dims_mapping(grad_var.name, + dims_mapping) + self._dist_context.set_op_dist_attr_for_program(ops[idx], + op_dist_attr) + continue - if dist_context is None: - dist_context = get_default_distributed_context() - - first_backward_op_idx = -1 - for idx, op in enumerate(auto_parallel_main_prog.global_block().ops): - if int(op.attr('op_role')) == int( - int(core.op_proto_and_checker_maker.OpRole.Backward) | int( - core.op_proto_and_checker_maker.OpRole.Loss)): - assert op.type == "fill_constant" - first_backward_op_idx = idx - break - - assert first_backward_op_idx >= 0, "No backward procedure found in this program." - - ops = list(auto_parallel_main_prog.global_block().ops) - vars = auto_parallel_main_prog.global_block().vars - dist_op_context = dist_context.dist_op_context - - for idx in range(first_backward_op_idx, len(ops)): - - # complete the initial grad loss op - if idx == first_backward_op_idx: - assert ops[idx].type == "fill_constant" - assert len( - ops[idx].input_arg_names - ) == 0, "first backward op should has only ONE output, but got [{}]".format( - len(ops[idx].input_arg_names)) - assert len( - ops[idx].output_arg_names - ) == 1, "first backward op should has only ONE output, but got [{}]".format( - len(ops[idx].output_arg_names)) - - grad_var = vars[ops[idx].output_arg_names[0]] - forward_var_name = _get_forward_varname_from_grad_varname( - grad_var.name) - forward_var = vars[forward_var_name] - - # TODO complete other attribte for grad var - tensor_dist_attr = TensorDistributedAttribute() - process_mesh = dist_context.get_tensor_dist_attr_for_program( - forward_var).process_mesh - dims_mapping = dist_context.get_tensor_dist_attr_for_program( - forward_var).dims_mapping - tensor_dist_attr.dims_mapping = dims_mapping - tensor_dist_attr.process_mesh = process_mesh - dist_context.set_tensor_dist_attr_for_program(grad_var, - tensor_dist_attr) - - op_dist_attr = OperatorDistributedAttribute() - op_dist_attr.process_mesh = process_mesh 
- op_dist_attr.set_output_dims_mapping(grad_var.name, dims_mapping) - dist_context.set_op_dist_attr_for_program(ops[idx], op_dist_attr) - continue - - # complete the annotation of grad op (xxx_grad op or sum op) - # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id - grad_op = ops[idx] - if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: - # TODO support the case where one forward op corresponding to multiple xxx_grad op - forward_op = _get_op_by_id( - ops[:first_backward_op_idx], - dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) - assert forward_op is not None - - # op dist attr - forward_op_dist_attr = dist_context.get_op_dist_attr_for_program( - forward_op) - forward_op_process_mesh = forward_op_dist_attr.process_mesh - grad_op_dist_attr = OperatorDistributedAttribute() - grad_op_dist_attr.process_mesh = forward_op_process_mesh - - # var - for input_name in grad_op.input_arg_names: - input_var = vars[input_name] - ref_dims_mapping = None - if "@GRAD" in input_name: - forward_name = _get_forward_varname_from_grad_varname( - input_name) - ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping( - forward_name) - else: - if forward_op_dist_attr.get_input_dims_mapping(input_name): - ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( + # complete the annotation of grad op (xxx_grad op or sum op) + # xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id + grad_op = ops[idx] + if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id: + # TODO support the case where one forward op corresponding to multiple xxx_grad op + forward_op = _get_op_by_id( + ops[:first_backward_op_idx], + dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()]) + assert forward_op is not None + + # op dist attr + forward_op_dist_attr = self._dist_context.get_op_dist_attr_for_program( + forward_op) + forward_op_process_mesh = forward_op_dist_attr.process_mesh + grad_op_dist_attr = OperatorDistributedAttribute() + 
grad_op_dist_attr.process_mesh = forward_op_process_mesh + + # var + for input_name in grad_op.input_arg_names: + input_var = vars[input_name] + ref_dims_mapping = None + if "@GRAD" in input_name: + forward_name = _get_forward_varname_from_grad_varname( input_name) - else: ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping( - input_name) - - assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format( - input_var.name) - grad_op_dist_attr.set_input_dims_mapping(input_name, - ref_dims_mapping) - - for output_name in grad_op.desc.output_names(): - assert len(grad_op.desc.output(output_name)) in [0, 1] - if _is_grad_var_name(output_name): - input_name = _get_forward_varname_from_grad_varname( - output_name) - else: - assert grad_op.type in [ - "cast", "c_identity", "c_allreduce_sum" - ] - input_name = "X" - assert input_name in forward_op.desc.input_names( - ), "var [{}] in op [{}]'s output but could not find [{}] in its forward op".format( - output_name, grad_op.type, input_name) - if len(grad_op.desc.output(output_name)) == 1: - # tensor dist attr - output_var = vars[grad_op.desc.output(output_name)[0]] - forward_name = _get_forward_varname_from_grad_varname( - output_var.name) - ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( - forward_name) - - output_var_dist_attr = TensorDistributedAttribute() - output_var_dist_attr.dims_mapping = ref_dims_mapping - output_var_dist_attr.process_mesh = forward_op_process_mesh - dist_context.set_tensor_dist_attr_for_program( - output_var, output_var_dist_attr) - - grad_op_dist_attr.set_output_dims_mapping(output_var.name, - ref_dims_mapping) - - dist_context.set_op_dist_attr_for_program(grad_op, - grad_op_dist_attr) - - # only sum op for merge mutiple version grad has no a corresponding mapping in grad_op_id_to_op_id - else: - assert grad_op.type == "sum", "got unexpect op [{}]".format( - str(grad_op.type)) - assert all(map(_is_grad_var_name, grad_op.input_arg_names)) - assert 
len(grad_op.output_arg_names) == 1 - - ref_forward_var_name = _get_forward_varname_from_grad_varname( - grad_op.output_arg_names[0]) - forward_var = vars[ref_forward_var_name] - ref_forward_var_dims_mapping = dist_context.get_tensor_dist_attr_for_program( - forward_var).dims_mapping - ref_forward_var_process_mesh = dist_context.get_tensor_dist_attr_for_program( - forward_var).process_mesh - - # output - tensor_dist_attr = TensorDistributedAttribute() - tensor_dist_attr.dims_mapping = ref_forward_var_dims_mapping - tensor_dist_attr.process_mesh = ref_forward_var_process_mesh - dist_context.set_tensor_dist_attr_for_program( - vars[grad_op.output_arg_names[0]], tensor_dist_attr) - - # op - grad_op_dist_attr = OperatorDistributedAttribute() - grad_op_dist_attr.process_mesh = ref_forward_var_process_mesh - for var_name in grad_op.input_arg_names: - assert _get_forward_varname_from_grad_varname( - var_name) == ref_forward_var_name - grad_op_dist_attr.set_input_dims_mapping( - var_name, ref_forward_var_dims_mapping) - - grad_op_dist_attr.set_output_dims_mapping( - grad_op.output_arg_names[0], ref_forward_var_dims_mapping) - dist_context.set_op_dist_attr_for_program(grad_op, - grad_op_dist_attr) - - -def complete_update_annotation(auto_parallel_main_prog, dist_context): - """Complete the annotation of vars and ops in the update phase for parallel program.""" - - if dist_context is None: - dist_context = get_default_distributed_context() - - ops = list(auto_parallel_main_prog.global_block().ops) - vars = auto_parallel_main_prog.global_block().vars - learning_rate_completed = False - - for idx in range(len(ops)): - - # complete the annotation of the optimizer op. 
- # TODO to add attribute for moment var - op = ops[idx] - if int(op.attr('op_role')) == int(OpRole.Optimize): - if op.type == "clip_by_norm": - - param_grad = vars[op.input("X")[0]] - param_grad_dist_attr = dist_context.get_tensor_dist_attr_for_program( - param_grad) - assert param_grad_dist_attr is not None - ref_process_mesh = param_grad_dist_attr.process_mesh - ref_dims_mapping = param_grad_dist_attr.dims_mapping - - out = vars[op.output("Out")[0]] - out_dist_attr = TensorDistributedAttribute() - out_dist_attr.process_mesh = ref_process_mesh - out_dist_attr.dims_mapping = ref_dims_mapping - dist_context.set_tensor_dist_attr_for_program(out, - out_dist_attr) + forward_name) + else: + if forward_op_dist_attr.get_input_dims_mapping( + input_name): + ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( + input_name) + else: + ref_dims_mapping = forward_op_dist_attr.get_output_dims_mapping( + input_name) + + assert ref_dims_mapping is not None, "[{}] 's dims mapping is NONE".format( + input_var.name) + grad_op_dist_attr.set_input_dims_mapping(input_name, + ref_dims_mapping) - op_dist_attr = OperatorDistributedAttribute() - op_dist_attr.process_mesh = ref_process_mesh - op_dist_attr.set_input_dist_attr(param_grad.name, - param_grad_dist_attr) - op_dist_attr.set_output_dist_attr(out.name, out_dist_attr) - dist_context.set_op_dist_attr_for_program(op, op_dist_attr) - - if "Grad" in op.input_names and "Param" in ops[idx].input_names: - assert len(op.input( - "Param")) == 1, "Only support one-to-one now." - assert len(op.input( - "Grad")) == 1, "Only support one-to-one now." 
- param = vars[op.input("Param")[0]] - grad_var = vars[op.input("Grad")[0]] - - param_dist_attr = dist_context.get_tensor_dist_attr_for_program( - param) - assert param_dist_attr is not None - ref_process_mesh = dist_context.get_tensor_dist_attr_for_program( - param).process_mesh - assert ref_process_mesh is not None - ref_dims_mapping = dist_context.get_tensor_dist_attr_for_program( - param).dims_mapping - assert ref_dims_mapping is not None - op_dist_attr = OperatorDistributedAttribute() - op_dist_attr.process_mesh = ref_process_mesh - op_dist_attr.set_input_dims_mapping(grad_var.name, - ref_dims_mapping) - op_dist_attr.set_input_dims_mapping(param.name, - ref_dims_mapping) - op_dist_attr.set_output_dims_mapping(param.name, - ref_dims_mapping) - learning_var = vars[op.input("LearningRate")[0]] - op_dist_attr.set_input_dims_mapping(learning_var.name, [-1]) - op_dist_attr.set_output_dims_mapping(learning_var.name, [-1]) - - if not learning_rate_completed: - learning_rate_completed = True - var_dist_attr = TensorDistributedAttribute() - var_dist_attr.process_mesh = ref_process_mesh - var_dist_attr.dims_mapping = [-1] - dist_context.set_tensor_dist_attr_for_program(learning_var, - var_dist_attr) - - for input_name in op.desc.input_names(): - - if input_name in [ - 'Param', 'Grad', 'LearningRate', "SkipUpdate", - "Beta1Tensor", "Beta2Tensor", "EpsilonTensor", - "MasterParam" - ]: - continue + for output_name in grad_op.desc.output_names(): + assert len(grad_op.desc.output(output_name)) in [0, 1] + if _is_grad_var_name(output_name): + input_name = _get_forward_varname_from_grad_varname( + output_name) + else: + assert grad_op.type in [ + "cast", "c_identity", "c_allreduce_sum" + ] + input_name = "X" + assert input_name in forward_op.desc.input_names( + ), "var [{}] in op [{}]'s output but could not find [{}] in its forward op".format( + output_name, grad_op.type, input_name) + if len(grad_op.desc.output(output_name)) == 1: + # tensor dist attr + output_var = 
vars[grad_op.desc.output(output_name)[0]] + forward_name = _get_forward_varname_from_grad_varname( + output_var.name) + ref_dims_mapping = forward_op_dist_attr.get_input_dims_mapping( + forward_name) - assert len(op.desc.input(input_name)) == 1 - input_var = vars[op.desc.input(input_name)[0]] - input_var_attr = TensorDistributedAttribute() + output_var_dist_attr = TensorDistributedAttribute() + output_var_dist_attr.dims_mapping = ref_dims_mapping + output_var_dist_attr.process_mesh = forward_op_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + output_var, output_var_dist_attr) - if "Beta1Pow" in input_name or "Beta2Pow" in input_name: - input_var_attr.dims_mapping = [-1] - op_dist_attr.set_input_dims_mapping(input_var.name, - [-1]) - op_dist_attr.set_output_dims_mapping(input_var.name, - [-1]) - else: - assert "Moment" in input_name - input_var_attr.dims_mapping = ref_dims_mapping - op_dist_attr.set_input_dims_mapping(input_var.name, - ref_dims_mapping) - op_dist_attr.set_output_dims_mapping(input_var.name, - ref_dims_mapping) + grad_op_dist_attr.set_output_dims_mapping( + output_var.name, ref_dims_mapping) - input_var_attr.process_mesh = ref_process_mesh - dist_context.set_tensor_dist_attr_for_program( - input_var, input_var_attr) + self._dist_context.set_op_dist_attr_for_program( + grad_op, grad_op_dist_attr) - dist_context.set_op_dist_attr_for_program(op, op_dist_attr) - continue + # only the sum op that merges multiple versions of a grad has no corresponding mapping in grad_op_id_to_op_id + else: + assert grad_op.type == "sum", "got unexpect op [{}]".format( + str(grad_op.type)) + assert all(map(_is_grad_var_name, grad_op.input_arg_names)) + assert len(grad_op.output_arg_names) == 1 + + ref_forward_var_name = _get_forward_varname_from_grad_varname( + grad_op.output_arg_names[0]) + forward_var = vars[ref_forward_var_name] + ref_forward_var_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( + forward_var).dims_mapping +
ref_forward_var_process_mesh = self._dist_context.get_tensor_dist_attr_for_program( + forward_var).process_mesh + + # output + tensor_dist_attr = TensorDistributedAttribute() + tensor_dist_attr.dims_mapping = ref_forward_var_dims_mapping + tensor_dist_attr.process_mesh = ref_forward_var_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + vars[grad_op.output_arg_names[0]], tensor_dist_attr) + + # op + grad_op_dist_attr = OperatorDistributedAttribute() + grad_op_dist_attr.process_mesh = ref_forward_var_process_mesh + for var_name in grad_op.input_arg_names: + assert _get_forward_varname_from_grad_varname( + var_name) == ref_forward_var_name + grad_op_dist_attr.set_input_dims_mapping( + var_name, ref_forward_var_dims_mapping) + + grad_op_dist_attr.set_output_dims_mapping( + grad_op.output_arg_names[0], ref_forward_var_dims_mapping) + self._dist_context.set_op_dist_attr_for_program( + grad_op, grad_op_dist_attr) + + def complete_update_annotation(self, serial_main_program): + """Complete the annotation of vars and ops in the update phase for parallel program.""" + ops = list(serial_main_program.global_block().ops) + vars = serial_main_program.global_block().vars + learning_rate_completed = False + + for idx in range(len(ops)): + + # complete the annotation of the optimizer op. + # TODO to add attribute for moment var + op = ops[idx] + if int(op.attr('op_role')) == int(OpRole.Optimize): + + if "Grad" in op.input_names and "Param" in ops[idx].input_names: + assert len(op.input( + "Param")) == 1, "Only support one-to-one now." + assert len(op.input( + "Grad")) == 1, "Only support one-to-one now." 
+ param = vars[op.input("Param")[0]] + grad_var = vars[op.input("Grad")[0]] + + param_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( + param) + assert param_dist_attr is not None + ref_process_mesh = self._dist_context.get_tensor_dist_attr_for_program( + param).process_mesh + assert ref_process_mesh is not None + ref_dims_mapping = self._dist_context.get_tensor_dist_attr_for_program( + param).dims_mapping + assert ref_dims_mapping is not None + op_dist_attr = OperatorDistributedAttribute() + op_dist_attr.process_mesh = ref_process_mesh + op_dist_attr.set_input_dims_mapping(grad_var.name, + ref_dims_mapping) + op_dist_attr.set_input_dims_mapping(param.name, + ref_dims_mapping) + op_dist_attr.set_output_dims_mapping(param.name, + ref_dims_mapping) + learning_var = vars[op.input("LearningRate")[0]] + op_dist_attr.set_input_dims_mapping(learning_var.name, [-1]) + op_dist_attr.set_output_dims_mapping(learning_var.name, + [-1]) + + if not learning_rate_completed: + learning_rate_completed = True + var_dist_attr = TensorDistributedAttribute() + var_dist_attr.process_mesh = ref_process_mesh + var_dist_attr.dims_mapping = [-1] + self._dist_context.set_tensor_dist_attr_for_program( + learning_var, var_dist_attr) + + for input_name in op.desc.input_names(): + + if input_name in [ + 'Param', 'Grad', 'LearningRate', "SkipUpdate", + "Beta1Tensor", "Beta2Tensor", "EpsilonTensor", + "MasterParam" + ]: + continue + + assert len(op.desc.input(input_name)) == 1 + input_var = vars[op.desc.input(input_name)[0]] + input_var_attr = TensorDistributedAttribute() + + if "Beta1Pow" in input_name or "Beta2Pow" in input_name: + input_var_attr.dims_mapping = [-1] + op_dist_attr.set_input_dims_mapping(input_var.name, + [-1]) + op_dist_attr.set_output_dims_mapping(input_var.name, + [-1]) + else: + assert "Moment" in input_name + input_var_attr.dims_mapping = ref_dims_mapping + op_dist_attr.set_input_dims_mapping( + input_var.name, ref_dims_mapping) + 
op_dist_attr.set_output_dims_mapping( + input_var.name, ref_dims_mapping) + + input_var_attr.process_mesh = ref_process_mesh + self._dist_context.set_tensor_dist_attr_for_program( + input_var, input_var_attr) + + self._dist_context.set_op_dist_attr_for_program( + op, op_dist_attr) + continue diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index ad3a53ff17d76..e06811df88179 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -247,23 +247,23 @@ def get_op_dist_attr_for_graph(self, serial_op_node): # new_dist_op = DistributedOperator(dist_op.serial_op, dist_attr) # self._dist_ops_for_graph[serial_op_node_id] = new_dist_op - # def get_dist_attr_for_graph(self, serial_node): - # if serial_node.is_var() and serial_node.var() is not None: - # serial_tensor_node_id = serial_node.id() - # dist_tensor = self._dist_tensors_for_graph.get( - # serial_tensor_node_id, None) - # if dist_tensor: - # return dist_tensor.dist_attr - # else: - # return None - # if serial_node.is_op() and serial_node.op() is not None: - # serial_op_node_id = serial_node.id() - # dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) - # if dist_op: - # return dist_op.dist_attr - # else: - # return None - # return None + def get_dist_attr_for_graph(self, serial_node): + if serial_node.is_var() and serial_node.var() is not None: + serial_tensor_node_id = serial_node.id() + dist_tensor = self._dist_tensors_for_graph.get( + serial_tensor_node_id, None) + if dist_tensor: + return dist_tensor.dist_attr + else: + return None + if serial_node.is_op() and serial_node.op() is not None: + serial_op_node_id = serial_node.id() + dist_op = self._dist_ops_for_graph.get(serial_op_node_id, None) + if dist_op: + return dist_op.dist_attr + else: + return None + return None def init_dist_attr_for_program(self): assert self._serial_program, \ diff --git 
a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index d6035d02953ac..43f5fa264790f 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -32,7 +32,7 @@ from .dist_context import DistributedContext from .dist_context import get_default_distributed_context from .dist_context import set_default_distributed_context -from .completion import complete_annotation, complete_backward_annotation, complete_update_annotation +from .completion import Completer from .partitioner import Partitioner from .process_group import get_all_process_groups from .process_group import get_process_group @@ -130,8 +130,8 @@ def _generate_backward(self, main_program, startup_program, loss, no_grad_set, callbacks, distop_context=self._dist_context.dist_op_context) - complete_backward_annotation( - main_program, dist_context=self._dist_context) + self._completer = Completer(self._dist_context) + self._completer.complete_backward_annotation(main_program) return params_grads @@ -142,8 +142,8 @@ def _apply_optimize(self, main_program, startup_program, params_grads): params_grads) # update completion - complete_update_annotation( - main_program, dist_context=self._dist_context) + self._completer = Completer(self._dist_context) + self._completer.complete_update_annotation(main_program) return optimize_ops @@ -179,8 +179,9 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): # Annotation completion self._dist_context = DistributedContext() _logger.info("Start annotation dist attr.") - completed_main_program = complete_annotation(serial_main_program, - self._dist_context) + self._completer = Completer(self._dist_context) + completed_main_program = self._completer.complete_forward_annotation( + serial_main_program) else: completed_main_program = serial_main_program self._dist_context = copy.deepcopy(dist_context) diff --git 
a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py index 05d71aca5db2c..bc4f1671f4e20 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py @@ -27,6 +27,7 @@ from paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr from paddle.distributed.auto_parallel.utils import append_distributed_attr_suffix @@ -154,10 +155,9 @@ def test_mlp_dp(self): dist_context = DistributedContext() train_program, start_program = mlp_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_mlp_mp(self): @@ -171,10 +171,9 @@ def test_mlp_mp(self): dist_context = DistributedContext() train_program, start_program = mlp_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_mlp_dp_mp(self): @@ -189,10 +188,9 @@ def test_mlp_dp_mp(self): dist_context = DistributedContext() train_program, start_program = 
mlp_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) # def test_mlp_misc(self): @@ -212,8 +210,8 @@ def test_mlp_dp_mp(self): # train_program, start_program = mlp_pretrain_forward(train_program, # start_program) # # pdb.set_trace() - # complete_train_program = auto.complete_annotation(train_program, - # dist_context) + # completer = Completer(dist_context) + # complete_train_program = auto.completer.complete_forward_annotation(train_program) # # print_program_with_dist_attr(complete_train_program, # # dist_context) # dist_context.finalize_distributed_attr_for_program( @@ -423,8 +421,9 @@ def test_attn_dp(self): dist_context = DistributedContext() train_program, start_program = attn_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) # print_program_with_dist_attr(complete_train_program, # dist_context) self.assertTrue(dist_context.validate_dist_attr_for_program()) @@ -440,10 +439,9 @@ def test_attn_mp(self): dist_context = DistributedContext() train_program, start_program = attn_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_attn_dp_mp(self): @@ -458,10 +456,9 @@ def test_attn_dp_mp(self): 
dist_context = DistributedContext() train_program, start_program = attn_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) @@ -747,10 +744,9 @@ def test_decoder_dp(self): dist_context = DistributedContext() train_program, start_program = decoder_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_decoder_mp(self): @@ -764,10 +760,9 @@ def test_decoder_mp(self): dist_context = DistributedContext() train_program, start_program = decoder_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_decoder_dp_mp(self): @@ -782,10 +777,9 @@ def test_decoder_dp_mp(self): dist_context = DistributedContext() train_program, start_program = decoder_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) 
self.assertTrue(dist_context.validate_dist_attr_for_program()) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py index c2c1e63155c3a..1293a9644027d 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py @@ -31,6 +31,7 @@ from paddle.distributed.fleet import fleet import paddle.static as static import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr from paddle.distributed.auto_parallel.dist_context import DistributedContext @@ -817,10 +818,9 @@ def test_gpt_dp(self): dist_context = DistributedContext() train_program, start_program = gpt_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_gpt_mp(self): @@ -834,10 +834,9 @@ def test_gpt_mp(self): dist_context = DistributedContext() train_program, start_program = gpt_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_gpt_dp_mp(self): @@ -852,10 +851,9 @@ def test_gpt_dp_mp(self): dist_context = 
DistributedContext() train_program, start_program = gpt_pretrain_forward(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) - # print_program_with_dist_attr(complete_train_program, - # dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) self.assertTrue(dist_context.validate_dist_attr_for_program()) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index 83254de61298b..fd19a5bd8b866 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -23,6 +23,7 @@ import paddle.nn.functional as F import paddle.utils as utils import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner @@ -154,8 +155,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): parallelizer._dist_context = dist_context # serial forward & backward completion - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) params_grads = parallelizer._generate_backward( complete_train_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py index b21cbb5ae78bc..27de9f325063b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py @@ -18,6 +18,7 @@ import paddle 
from paddle.fluid import core import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.partitioner import Partitioner @@ -42,8 +43,9 @@ def get_dist_prog(train_program, parallelizer._dist_context = dist_context # serial forward & backward completion - complete_train_program = auto.complete_annotation( - train_program, dist_context + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program ) if complete_train_program is None else complete_train_program # parallelizer._apply_serial_forward_pass(complete_train_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 3a28595c833e0..9d4de771076cd 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -36,6 +36,7 @@ from paddle.distributed import fleet import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed.auto_parallel.partitioner import Partitioner @@ -433,6 +434,12 @@ def forward(self, input): out = F.gelu(out, approximate=True) out = self.linear1(out) + auto.shard_tensor( + out, + dist_attr={ + "process_mesh": _global_process_mesh[1], + "dims_mapping": [0, -1] + }) out = self.linear2(out) out = F.gelu(out, approximate=True) out = self.linear3(out) @@ -476,8 +483,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): parallelizer._dist_context = dist_context # auto completion - complete_train_program = 
auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) params_grads = parallelizer._generate_backward( complete_train_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index 21cf8a904b690..deff2144411fc 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -28,6 +28,7 @@ from paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr from paddle.distributed.auto_parallel.utils import append_distributed_attr_suffix @@ -49,8 +50,9 @@ def get_programs(annotated_func): global _global_process_mesh dist_context.process_mesh = _global_process_mesh train_program, start_program = annotated_func(train_program, start_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) rank_id = 3 dist_strategy = fleet.DistributedStrategy() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index dc2ad1d900f52..01e62d886e2b7 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -31,6 +31,7 @@ from paddle.distributed import fleet import paddle.static as static 
import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr from paddle.distributed.auto_parallel.dist_context import DistributedContext @@ -881,8 +882,9 @@ def test_gpt_dp_mp(self): dist_context.process_mesh = _global_process_mesh train_program, startup_program, loss = gpt_pretrain_forward( train_program, startup_program) - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) # serial backward pass params_grads = parallelizer._generate_backward( @@ -913,8 +915,9 @@ def test_gpt_dp_mp(self): "w") as fw: fw.write(str(auto_parallel_startup_prog)) # with open("./test_auto_parallel_partitioner_main_completed.txt", "w") as fw: - # from paddle.distributed.auto_parallel.completion import complete_backward_annotation - # complete_backward_annotation(auto_parallel_main_prog) + # from paddle.distributed.auto_parallel.completion import Completer + # completer = Completer() + # completer.complete_forward_annotation(auto_parallel_main_prog) # fw.write(str(auto_parallel_main_prog)) nrank = 4 # col parallel diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 614b996d26521..b234e25823f4b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F import paddle.utils as utils import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from 
paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer @@ -152,8 +153,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): parallelizer._dist_context = dist_context # serial forward & backward completion - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) params_grads = parallelizer._generate_backward( complete_train_program, @@ -299,7 +301,6 @@ def test_mlp_pp(self): for key in list(_g_process_group_map.keys()): del _g_process_group_map[key] reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) - # print_program_with_dist_attr(dist_main_prog, dist_context) # check send and recv result self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index cfbb7653fad8e..40847a769033a 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F import paddle.utils as utils import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer @@ -116,8 +117,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): parallelizer._dist_context = dist_context # serial forward & backward completion - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = 
completer.complete_forward_annotation( + train_program) params_grads = parallelizer._generate_backward( complete_train_program, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index 272c1c212f08e..869bcd4c7ab32 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -22,6 +22,7 @@ import paddle.nn.functional as F import paddle.utils as utils import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer @@ -132,8 +133,9 @@ def get_dist_prog(train_program, startup_program, dist_context, rank_id): parallelizer._dist_context = dist_context # serial forward & backward completion - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) params_grads = parallelizer._generate_backward( complete_train_program, @@ -263,8 +265,9 @@ def test_allgather(self): dist_context = DistributedContext() dist_strategy = fleet.DistributedStrategy() partitioner = Partitioner(dist_context, rank_id) - complete_train_program = auto.complete_annotation(train_program, - dist_context) + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) partitioned_main_prog, partitioned_startup_prog, partitioned_params_grads = partitioner.partition( complete_train_program, startup_program, []) reshard(partitioned_main_prog, partitioned_startup_prog, rank_id, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py 
b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py index ed64fa0630fa1..78ad64b1dd852 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py @@ -154,7 +154,7 @@ def test_update(self): ops = train_program.global_block().ops vars = train_program.global_block().vars from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container - from paddle.distributed.auto_parallel.completion import is_elementwise_like_op + from paddle.distributed.auto_parallel.operators.common import is_elementwise_op from paddle.distributed.auto_parallel.dist_op import DistributedOperator for op in ops: @@ -163,7 +163,7 @@ def test_update(self): if dist_op_impl_container is None: op_dist_attr = dist_context.get_op_dist_attr_for_program(op) dist_op = DistributedOperator(op, op_dist_attr) - if is_elementwise_like_op(op.type): + if is_elementwise_op(op.type): changed = update_op_dims_mapping_by_elementwise_like_dist_impl( dist_op) self.assertFalse(changed) From ba51a6c8101714dbd03a60830e79c64cb9af7bef Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Fri, 21 Jan 2022 12:05:09 +0800 Subject: [PATCH 02/15] fix gcd and lcm data type (#39043) --- python/paddle/tensor/math.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index c4a92b1486d58..a476a8ccd120a 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3559,8 +3559,8 @@ def gcd(x, y, name=None): If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). Args: - x (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. - y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + x (Tensor): An N-D Tensor, the data type is int32,int64. + y (Tensor): An N-D Tensor, the data type is int32,int64. 
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -3621,8 +3621,8 @@ def _gcd_body_fn(x, y): return x else: - check_variable_and_dtype(x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') - check_variable_and_dtype(y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') + check_variable_and_dtype(x, 'x', ['int32', 'int64'], 'gcd') + check_variable_and_dtype(y, 'y', ['int32', 'int64'], 'gcd') out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y]) return out @@ -3637,8 +3637,8 @@ def lcm(x, y, name=None): If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). Args: - x (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. - y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + x (Tensor): An N-D Tensor, the data type is int32,int64. + y (Tensor): An N-D Tensor, the data type is int32,int64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Returns: From 4adeff06aebf2d824e361caced9f94506a68533b Mon Sep 17 00:00:00 2001 From: FlyingQianMM <245467267@qq.com> Date: Fri, 21 Jan 2022 12:51:57 +0800 Subject: [PATCH 03/15] add block and grid loop for index_sample kernel to deal with a large-shape tensor (#37816) * add block and grid loop for index_sample kernel to deal with a large-shape tensor * fix code format * limit grid dim --- paddle/fluid/operators/index_sample_op.cu | 63 +++++++++++++++-------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 4260d0516e3cc..45f63c2b2fbd8 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -18,9 +18,22 @@ #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + namespace paddle { namespace operators { +namespace { +void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { + dim3 max_grid_dim = ctx.template device_context() + .GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim.x ? grid_dim->x : max_grid_dim.x; + grid_dim->y = grid_dim->y < max_grid_dim.y ? 
grid_dim->y : max_grid_dim.y; +} +} + using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; @@ -28,14 +41,15 @@ template __global__ void IndexSampleForward(const IndexT* index, const T* in_data, T* out_data, size_t index_length, size_t input_length, size_t batch_size) { - int index_i = blockDim.x * blockIdx.x + threadIdx.x; - int index_j = blockDim.y * blockIdx.y + threadIdx.y; - int index_idx = index_j * index_length + index_i; - int in_idx = index_j * input_length + index_i; - - if (index_i < index_length & index_j < batch_size) { - IndexT sample_idx = index[index_idx]; - out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; + } } } @@ -44,18 +58,20 @@ __global__ void IndexSampleGrad(const IndexT* index, T* in_grad, const T* out_grad, size_t index_length, size_t input_length, size_t batch_size, bool same_data_in_row = true) { - int index_i = blockDim.x * blockIdx.x + threadIdx.x; - int index_j = blockDim.y * blockIdx.y + threadIdx.y; - int index_idx = index_j * index_length + index_i; - int in_idx = index_j * input_length + index_i; - - if (index_i < index_length & index_j < batch_size) { - IndexT sample_idx = index[index_idx]; - if (same_data_in_row) { - platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), - out_grad[sample_idx]); - } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + + for (; 
index_j < batch_size; index_j += blockDim.y * gridDim.y) { + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + if (same_data_in_row) { + platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), + out_grad[sample_idx]); + } else { + in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; + } } } } @@ -93,12 +109,14 @@ class IndexSampleKernel size_t index_length = index_dim[1]; auto block_width = platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); int block_height = platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); dim3 block_dim(block_width, block_height); dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); if (index_type == framework::proto::VarType::INT64) { const int64_t* index_data = index->data(); @@ -150,11 +168,14 @@ class IndexSampleGradKernel bool same_data_in_index_row = index_length == 1 ? 
false : true; auto block_width = platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); auto block_height = platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); dim3 block_dim(block_width, block_height); dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); math::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); From 89f903da1fba9527dc900266baf5a17e6711d7d8 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 21 Jan 2022 13:21:31 +0800 Subject: [PATCH 04/15] fix npu c_allgather int64 (#39099) --- paddle/fluid/operators/collective/c_allgather_op_npu.cc | 1 + paddle/fluid/platform/device/npu/hccl_helper.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc index 4fa27f5eb9bee..5ebcc9064f790 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -79,5 +79,6 @@ namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel, ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, ops::CAllGatherOpASCENDKernel, ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index c2338fff02926..efbc56bee720b 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ b/paddle/fluid/platform/device/npu/hccl_helper.h @@ -41,6 +41,8 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { return HCCL_DATA_TYPE_FP32; } else if (type == framework::proto::VarType::FP16) { return HCCL_DATA_TYPE_FP16; + } else if (type == framework::proto::VarType::INT64) { + return HCCL_DATA_TYPE_INT64; 
} else if (type == framework::proto::VarType::INT32) { return HCCL_DATA_TYPE_INT32; } else if (type == framework::proto::VarType::INT8) { From cf6516ffab24cc6ebc8b167dba53567ab1e60eb6 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Fri, 21 Jan 2022 13:49:08 +0800 Subject: [PATCH 05/15] update recommend member (#39083) * update recommend member, test=document_fix * remove update of UB rule file, test=document_fix --- paddle/scripts/paddle_build.sh | 4 ++-- tools/ci_op_benchmark.sh | 4 ++-- tools/test_ci_op_benchmark.sh | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7a2fa58be4978..cf326a68e5948 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2347,11 +2347,11 @@ function collect_ccache_hits() { function test_op_benchmark() { # The PR will pass quickly when get approval from specific person. - # Xreki 12538138, luotao1 6836917, Avin0323 23427135 + # Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ "${approval_line}" != "" ]; then - APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917) + APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "TRUE" ]; then echo "===================================" diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 48074c205774c..f2f83c8dfbb8d 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -286,11 +286,11 @@ function gpu_op_benchmark { # The PR will pass quickly when get approval from specific person. 
-# Xreki 12538138, luotao1 6836917, Avin0323 23427135 +# Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then - APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917) + APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "TRUE" ]; then LOG "[INFO] ===================================" diff --git a/tools/test_ci_op_benchmark.sh b/tools/test_ci_op_benchmark.sh index 25e613dd6bcd0..bf70d8bc3a495 100644 --- a/tools/test_ci_op_benchmark.sh +++ b/tools/test_ci_op_benchmark.sh @@ -273,7 +273,7 @@ function check_CHANGE_OP_MAP { done if [ $exit_code -ne 0 ]; then LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details." - LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR." + LOG "[INFO] Or you can apply for one RD (ZzSean(Recommend), Xreki, luotao1) approval to pass this PR." exit $exit_code fi } @@ -317,11 +317,11 @@ function gpu_op_benchmark { } # The PR will pass quickly when get approval from specific person. 
-# Xreki 12538138, luotao1 6836917, Avin0323 23427135 +# Xreki 12538138, luotao1 6836917, ZzSean 32410583 set +x approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ -n "${approval_line}" ]; then - APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 23427135 12538138 6836917) + APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 32410583 12538138 6836917) LOG "[INFO] current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "TRUE" ]; then LOG "[INFO] ===================================" From 4e23ba325db40a212ed30165143bcb5301bd106c Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 21 Jan 2022 14:55:00 +0800 Subject: [PATCH 06/15] [PTen]Migrate Dim and DDim from paddle::framework into pten namespace (#39053) * Migrate Dim and DDim from paddle::framework into pten namespace * fix paddle::framework::Array * fix framework::Array --- paddle/fluid/framework/CMakeLists.txt | 8 - paddle/fluid/framework/ddim.h | 230 +--------------- paddle/fluid/framework/ddim_test.cc | 84 ------ paddle/fluid/framework/dim.h | 82 +----- .../amp/check_finite_and_unscale_op.h | 2 +- .../operators/amp/update_loss_scaling_op.h | 2 +- paddle/fluid/operators/bce_loss_op.cu | 2 +- paddle/fluid/operators/bernoulli_op.h | 2 +- paddle/fluid/operators/bilateral_slice_op.h | 2 +- paddle/fluid/operators/bincount_op.cu | 2 +- paddle/fluid/operators/deformable_conv_func.h | 2 +- paddle/fluid/operators/dequantize_log_op.cu | 2 +- .../fluid/operators/detection/box_clip_op.cu | 2 +- .../detection/sigmoid_focal_loss_op.cu | 2 +- .../fluid/operators/detection/yolo_box_op.h | 2 +- paddle/fluid/operators/distribution_helper.h | 2 +- .../elementwise/elementwise_functor.h | 34 +-- paddle/fluid/operators/fake_quantize_op.h | 2 +- paddle/fluid/operators/grid_sampler_op.h | 2 +- paddle/fluid/operators/histogram_op.cu | 
2 +- paddle/fluid/operators/huber_loss_op.h | 2 +- paddle/fluid/operators/interpolate_op.h | 2 +- paddle/fluid/operators/interpolate_v2_op.h | 2 +- .../kernel_primitives/datamover_primitives.h | 3 +- paddle/fluid/operators/kldiv_loss_op.h | 2 +- paddle/fluid/operators/lstm_unit_op.cu | 2 +- paddle/fluid/operators/math.h | 2 +- paddle/fluid/operators/math/algorithm.h | 2 +- .../fluid/operators/math/complex_functors.h | 2 +- paddle/fluid/operators/math/cos_sim_functor.h | 2 +- paddle/fluid/operators/math/cross_entropy.h | 2 +- paddle/fluid/operators/math/depthwise_conv.h | 2 +- .../math/detail/activation_functions.h | 2 +- .../fluid/operators/math/detail/gru_kernel.h | 2 +- .../fluid/operators/math/detail/lstm_kernel.h | 2 +- paddle/fluid/operators/math/maxouting.h | 2 +- paddle/fluid/operators/math/pooling.h | 2 +- .../fluid/operators/modified_huber_loss_op.cu | 2 +- .../fluid/operators/modified_huber_loss_op.h | 2 +- paddle/fluid/operators/multinomial_op.h | 2 +- paddle/fluid/operators/nll_loss_op.cu | 2 +- paddle/fluid/operators/roll_op.cu | 14 +- .../sigmoid_cross_entropy_with_logits_op.cu | 2 +- paddle/fluid/operators/smooth_l1_loss_op.h | 2 +- paddle/fluid/operators/unstack_op.h | 1 - paddle/fluid/platform/aligned_vector.h | 2 +- paddle/fluid/platform/eigen_ext.h | 2 +- paddle/fluid/platform/transform.h | 2 +- paddle/fluid/platform/transform_test.cu | 2 +- paddle/pten/api/include/tensor.h | 10 +- paddle/pten/api/lib/tensor.cc | 6 +- paddle/pten/core/CMakeLists.txt | 9 + paddle/{fluid/framework => pten/core}/array.h | 10 +- paddle/{fluid/framework => pten/core}/ddim.cc | 85 +++--- paddle/pten/core/ddim.h | 257 ++++++++++++++++++ paddle/pten/core/ddim_test.cc | 83 ++++++ paddle/pten/core/dim.h | 100 +++++++ .../framework => pten/core}/dim_test.cu | 41 +-- .../platform => pten/core}/hostdevice.h | 5 +- paddle/pten/core/tensor_base.h | 4 +- paddle/pten/core/tensor_meta.h | 4 +- .../core}/unroll_array_ops.h | 8 +- .../core}/unroll_array_ops_test.cc | 8 +- 
paddle/pten/infermeta/binary.cc | 8 +- paddle/pten/infermeta/nullary.cc | 4 +- paddle/pten/infermeta/unary.cc | 28 +- paddle/pten/kernels/cpu/elementwise.h | 4 +- paddle/pten/kernels/cpu/reduce.h | 4 +- paddle/pten/kernels/empty_kernel.cc | 2 +- paddle/pten/kernels/flatten_grad_kernel.cc | 3 +- paddle/pten/kernels/funcs/common_shape.h | 2 +- paddle/pten/kernels/funcs/elementwise_base.h | 26 +- .../pten/kernels/funcs/elementwise_functor.h | 2 +- paddle/pten/kernels/funcs/transpose.cc | 6 +- paddle/pten/kernels/funcs/transpose.cu | 6 +- paddle/pten/kernels/funcs/transpose.h | 2 +- paddle/pten/kernels/gpu/elementwise.h | 37 ++- paddle/pten/kernels/gpu/reduce.h | 20 +- .../pten/kernels/impl/dot_grad_kernel_impl.h | 4 +- paddle/pten/kernels/impl/full_kernel_impl.h | 2 +- .../kernels/impl/matmul_grad_kernel_impl.h | 4 +- paddle/pten/kernels/impl/matmul_kernel_impl.h | 10 +- paddle/pten/tests/api/test_cast_api.cc | 2 +- paddle/pten/tests/api/test_conj_api.cc | 2 +- paddle/pten/tests/api/test_dot_api.cc | 2 +- paddle/pten/tests/api/test_elementwise_api.cc | 2 +- paddle/pten/tests/api/test_empty_api.cc | 2 +- paddle/pten/tests/api/test_fill_api.cc | 2 +- paddle/pten/tests/api/test_flatten_api.cc | 2 +- paddle/pten/tests/api/test_matmul_api.cc | 2 +- paddle/pten/tests/api/test_mean_api.cc | 2 +- paddle/pten/tests/api/test_reshape_api.cc | 2 +- paddle/pten/tests/api/test_scale_api.cc | 2 +- paddle/pten/tests/api/test_sum_api.cc | 2 +- paddle/pten/tests/api/test_to_api.cc | 2 +- .../pten/tests/kernels/test_cast_dev_api.cc | 2 +- .../pten/tests/kernels/test_conj_dev_api.cc | 2 +- .../pten/tests/kernels/test_copy_dev_api.cc | 2 +- .../tests/kernels/test_creation_dev_api.cc | 2 +- paddle/pten/tests/kernels/test_dot_dev_api.cc | 2 +- .../tests/kernels/test_elementwise_dev_api.cc | 2 +- .../tests/kernels/test_flatten_dev_api.cc | 2 +- .../pten/tests/kernels/test_matmul_dev_api.cc | 2 +- .../pten/tests/kernels/test_mean_dev_api.cc | 2 +- .../tests/kernels/test_reshape_dev_api.cc | 2 
+- .../pten/tests/kernels/test_scale_dev_api.cc | 2 +- paddle/pten/tests/kernels/test_sum_dev_api.cc | 2 +- 107 files changed, 734 insertions(+), 658 deletions(-) delete mode 100644 paddle/fluid/framework/ddim_test.cc rename paddle/{fluid/framework => pten/core}/array.h (94%) rename paddle/{fluid/framework => pten/core}/ddim.cc (77%) create mode 100644 paddle/pten/core/ddim.h create mode 100644 paddle/pten/core/ddim_test.cc create mode 100644 paddle/pten/core/dim.h rename paddle/{fluid/framework => pten/core}/dim_test.cu (62%) rename paddle/{fluid/platform => pten/core}/hostdevice.h (89%) rename paddle/{fluid/framework => pten/core}/unroll_array_ops.h (96%) rename paddle/{fluid/framework => pten/core}/unroll_array_ops_test.cc (92%) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 902943d14ff9d..83e5c1c17925e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -55,14 +55,6 @@ proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto cc_library(string_array SRCS string_array.cc DEPS utf8proc) -cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) -cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) -if(WITH_GPU) - nv_test(dim_test SRCS dim_test.cu DEPS ddim) -elseif(WITH_ROCM) - hip_test(dim_test SRCS dim_test.cu DEPS ddim) -endif() -cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) if(WITH_GPU) diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index 565e0b430dfdc..d150cca9d4c67 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -14,237 +14,13 @@ limitations under the License. 
*/ #pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/dim.h" +#include "paddle/pten/core/ddim.h" namespace paddle { namespace framework { -#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ - case (rank): { \ - constexpr auto kRank = (rank); \ - return (callback); \ - } - -#define PADDLE_VISIT_DDIM(rank, callback) \ - switch (rank) { \ - PADDLE_VISIT_DDIM_BASE(0, callback); \ - PADDLE_VISIT_DDIM_BASE(1, callback); \ - PADDLE_VISIT_DDIM_BASE(2, callback); \ - PADDLE_VISIT_DDIM_BASE(3, callback); \ - PADDLE_VISIT_DDIM_BASE(4, callback); \ - PADDLE_VISIT_DDIM_BASE(5, callback); \ - PADDLE_VISIT_DDIM_BASE(6, callback); \ - PADDLE_VISIT_DDIM_BASE(7, callback); \ - PADDLE_VISIT_DDIM_BASE(8, callback); \ - PADDLE_VISIT_DDIM_BASE(9, callback); \ - default: \ - PADDLE_THROW(platform::errors::Unimplemented( \ - "Invalid dimension to be accessed. Now only supports access to " \ - "dimension 0 to 9, but received dimension is %d.", \ - rank)); \ - } - -template -inline void dynamic_dim_assign(const T1* in, T2* out, int n) { - PADDLE_VISIT_DDIM(n, (static_dim_assign(in, out))); -} - -/** - * \brief A dynamically sized dimension. - * - * The number of dimensions must be between [1, 9]. 
- */ -class DDim { - public: - constexpr static int kMaxRank = 9; - - DDim() : rank_(1) { dim_[0] = 0; } - - DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } - - DDim(const int* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } - - DDim(const int64_t* d, int n) : rank_(n) { - dynamic_dim_assign(d, dim_.GetMutable(), n); - } - - template - /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT - UnsafeCast() = in; - } - - /*implicit*/ DDim(std::initializer_list init_list) - : DDim(init_list.begin(), init_list.size()) {} - - inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } - - template - inline DDim& operator=(const Dim& dim) { - rank_ = D; - UnsafeCast() = dim; - return *this; - } - - inline int64_t& operator[](int idx) { return dim_[idx]; } - - inline int64_t operator[](int idx) const { return dim_[idx]; } - - int64_t& at(int idx) { - PADDLE_ENFORCE_GE(idx, 0, - platform::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, idx)); - PADDLE_ENFORCE_LT(idx, rank_, - platform::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, idx)); - return dim_[idx]; - } - - int64_t at(int idx) const { - PADDLE_ENFORCE_GE(idx, 0, - platform::errors::InvalidArgument( - "Invalid DDim index to be accessed. The valid index " - "is between 0 and %d, but received index is %d.", - rank_, idx)); - PADDLE_ENFORCE_LT(idx, rank_, - platform::errors::InvalidArgument( - "Invalid DDim index to be accessed. 
The valid index " - "is between 0 and %d, but received index is %d.", - rank_, idx)); - return dim_[idx]; - } - - template - typename std::result_of&)>::type apply_visitor( - Visitor&& visitor) { - PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); - } - - template - typename std::result_of&)>::type apply_visitor( - Visitor&& visitor) const { - PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); - } - - bool operator==(const DDim& d) const; - - bool operator!=(const DDim& d) const; - - inline const int64_t* Get() const { return dim_.Get(); } - - inline int64_t* GetMutable() { return dim_.GetMutable(); } - - inline int size() const { return rank_; } - - std::string to_str() const; - - DDim reshape(const std::vector& shape) const; - - DDim transpose(const std::vector& axis) const; - - private: - template - inline Dim& UnsafeCast() { - static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); - auto* p = static_cast(&dim_); - return *reinterpret_cast*>(p); - } - - template - inline const Dim& UnsafeCast() const { - static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); - auto* p = static_cast(&dim_); - return *reinterpret_cast*>(p); - } - - inline DDim& CopyFrom(const DDim& ddim) { - PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); - } - - friend DDim stride(const DDim& ddim); - friend DDim stride_numel(const DDim& ddim); - - private: - Dim dim_; - int rank_; -}; - -#undef PADDLE_VISIT_DDIM_BASE -#undef PADDLE_VISIT_DDIM - -/** - * \brief Make a DDim from std::vector - * - * \param dims An vector of ints. Must be sized between [1, 9] - */ -DDim make_ddim(const std::vector& dims); - -DDim make_ddim(const std::vector& dims); - -/** - * \brief Make a DDim from an initializer list - * - * \param dims An initializer list of ints. 
Must be sized between [1, 9] - * - */ -DDim make_ddim(std::initializer_list dims); - -template -std::vector vectorize(const DDim& ddim) { - std::vector result(DDim::kMaxRank); - dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); - result.resize(ddim.size()); - return result; -} - -int64_t product(const DDim& ddim); - -bool contain_unknown_dim(const DDim& ddim); - -/** - * \brief Slice a ddim - * - * Slice dim with [begin, end). - * e.g. DDim d = make_ddim({1,2,3,4,5}); - * slice_ddim(d, 1, 3); ====> {2,3} - */ -DDim slice_ddim(const DDim& dim, int begin, int end); - -/** - * \brief What is the length of this dimension? - * - * \param Dynamic dimension to inspect - */ - -int arity(const DDim& ddim); - -std::ostream& operator<<(std::ostream&, const DDim&); - -/** -* \brief Flatten dim to 3d -* e.g., DDim d = mak_ddim({1, 2, 3, 4, 5, 6}) -* flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} -*/ -DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims); - -// Reshape a tensor to a matrix. The matrix's first dimension(column length) -// will be the product of tensor's first `num_col_dims` dimensions. -DDim flatten_to_2d(const DDim& src, int num_col_dims); - -DDim flatten_to_1d(const DDim& src); - -DDim stride(const DDim& ddim); +using DDim = pten::framework::DDim; +using namespace pten::framework; // NOLINT -DDim stride_numel(const DDim& ddim); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc deleted file mode 100644 index e89f77ae496c4..0000000000000 --- a/paddle/fluid/framework/ddim_test.cc +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/ddim.h" - -TEST(DDim, Equality) { - // construct a DDim from an initialization list - paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5}); - EXPECT_EQ(ddim[0], 9); - EXPECT_EQ(ddim[1], 1); - EXPECT_EQ(ddim[2], 5); - - // construct a DDim from a vector - std::vector vec({9, 1, 5}); - paddle::framework::DDim vddim = paddle::framework::make_ddim(vec); - EXPECT_EQ(ddim[0], 9); - EXPECT_EQ(ddim[1], 1); - EXPECT_EQ(ddim[2], 5); - - // mutate a DDim - ddim[1] = 2; - EXPECT_EQ(ddim[1], 2); - ddim[0] = 6; - EXPECT_EQ(ddim[0], 6); - - // vectorize a DDim - std::vector res_vec = paddle::framework::vectorize(vddim); - EXPECT_EQ(res_vec[0], 9); - EXPECT_EQ(res_vec[1], 1); - EXPECT_EQ(res_vec[2], 5); - paddle::framework::Dim<3> d(3, 2, 1); - res_vec = paddle::framework::vectorize(paddle::framework::DDim(d)); - EXPECT_EQ(res_vec[0], 3); - EXPECT_EQ(res_vec[1], 2); - EXPECT_EQ(res_vec[2], 1); - - // arity of a DDim - EXPECT_EQ(paddle::framework::arity(ddim), 3); - EXPECT_EQ(ddim.size(), 3); - - // product of a DDim - EXPECT_EQ(paddle::framework::product(vddim), 45); - EXPECT_EQ( - paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), - 90); - - // slice a DDim - paddle::framework::DDim ddim2 = - paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); - paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); - EXPECT_EQ(arity(ss), 3); - EXPECT_EQ(ss[0], 3); - EXPECT_EQ(ss[1], 4); - EXPECT_EQ(ss[2], 5); - paddle::framework::DDim ss2 = 
paddle::framework::slice_ddim(ddim2, 0, 6); - EXPECT_EQ(arity(ss2), 6); - EXPECT_EQ(ss2[0], 1); - EXPECT_EQ(ss2[1], 2); - EXPECT_EQ(ss2[2], 3); - EXPECT_EQ(ss2[3], 4); - EXPECT_EQ(ss2[4], 5); - EXPECT_EQ(ss2[5], 6); -} - -TEST(DDim, Print) { - // print a DDim - std::stringstream ss; - paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4}); - ss << ddim; - EXPECT_EQ("2, 3, 4", ss.str()); -} diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 66214b265fdf9..6abae4e731832 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -12,89 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. #pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/array.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/dim.h" namespace paddle { namespace framework { - -// Statically sized, statically indexed dimension template -class Dim : public Array { - public: - static_assert(D >= 0, "D must be not less than 0"); - - static constexpr int kRank = D; - using BaseClass = Array; - - inline Dim(int64_t head, const Dim& tail) { - (*this)[0] = head; - new (this->GetMutable() + 1) Dim(tail); - } - - template - HOSTDEVICE explicit Dim(int64_t head, Args... args) - : BaseClass(head, args...) {} - - /** Construct a Dim with each dimension set to the given index */ - HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } - - HOSTDEVICE Dim() = default; - - HOST std::string to_string() const; -}; - -// Product of a Dim -template -HOSTDEVICE inline int64_t product(const Dim& a) { - return UnrollProduct::Run(a.Get()); -} - -/** - * Helper function to create a Dim - * - * \param idxes The type of Dim constructed depends on the number of params - * - */ - -template -HOSTDEVICE inline Dim make_dim(Args... 
idxes) { - return Dim(idxes...); -} - -// Allows us to output a Dim -template -inline std::ostream& operator<<(std::ostream& os, const Dim& d) { - os << d[0]; - for (int i = 1; i < D; ++i) { - os << ", " << d[i]; - } - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { - return os; -} - -template -HOST std::string Dim::to_string() const { - std::stringstream stream; - stream << *this; - return stream.str(); -} - -template -inline void static_dim_assign(const T1* in, T2* out) { - UnrollAssign::Run(in, out); -} +using Dim = pten::framework::Dim; +using namespace pten::framework; // NOLINT } // namespace framework } // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h index 29b96c4a6704a..49ca2c3862a5e 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.h +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/isfinite_op.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.h b/paddle/fluid/operators/amp/update_loss_scaling_op.h index decc3c3b924c4..2c953d4eee373 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.h +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.h @@ -24,7 +24,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index d493dad132992..6595d6deccd9a 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bernoulli_op.h b/paddle/fluid/operators/bernoulli_op.h index 40f285d11f194..da66742e08fd9 100644 --- a/paddle/fluid/operators/bernoulli_op.h +++ b/paddle/fluid/operators/bernoulli_op.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bilateral_slice_op.h b/paddle/fluid/operators/bilateral_slice_op.h index 0903fe4c71d3d..3ef13c421cdfb 100644 --- a/paddle/fluid/operators/bilateral_slice_op.h +++ b/paddle/fluid/operators/bilateral_slice_op.h @@ -13,7 +13,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu index cf189193d1c11..5964b9e345e93 100644 --- a/paddle/fluid/operators/bincount_op.cu +++ b/paddle/fluid/operators/bincount_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/bincount_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_conv_func.h b/paddle/fluid/operators/deformable_conv_func.h index ba1c504430223..99d1d7c4776c3 100644 --- a/paddle/fluid/operators/deformable_conv_func.h +++ b/paddle/fluid/operators/deformable_conv_func.h @@ -24,7 +24,7 @@ #pragma once #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" template HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h, diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 39f4fdb71b69d..821b87bf0595a 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ 
b/paddle/fluid/operators/dequantize_log_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/dequantize_log_op.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 17013efcc98b7..53727d9d08747 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/box_clip_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index 10c402e5a4078..7102c4cffe21a 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index e06c81052a0f4..31a67ecc26635 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -14,7 +14,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h index 8bb963979e5a7..a13ae57090687 100644 --- a/paddle/fluid/operators/distribution_helper.h +++ b/paddle/fluid/operators/distribution_helper.h @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" #if !defined(_WIN32) #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 8a6cadc2413dc..daca105ce46bb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/framework/array.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/pten/core/array.h" #include "paddle/pten/kernels/funcs/elementwise_functor.h" namespace paddle { @@ -92,12 +92,12 @@ using Complex = paddle::platform::complex; template struct DivGradXYFunctor { - inline HOSTDEVICE paddle::framework::Array operator()(const InT a, - const InT b, - const InT c) { + inline HOSTDEVICE pten::framework::Array operator()(const InT a, + const InT b, + const InT c) { // dx = dout / y // dy = - dout * out / y - paddle::framework::Array outs; + pten::framework::Array outs; outs[0] = a / c; outs[1] = -a * b / c; return outs; @@ -106,9 +106,9 @@ struct DivGradXYFunctor { template struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE paddle::framework::Array, 2> operator()( + inline HOSTDEVICE pten::framework::Array, 2> operator()( const Complex a, const Complex b, const Complex c) { - paddle::framework::Array, 2> outs; + pten::framework::Array, 2> outs; Complex c_conj(c.real, -c.imag); Complex out_div_c_conj((b / c).real, -(b / c).imag); outs[0] = a / c_conj; @@ -247,9 +247,9 @@ struct MinGradYFunctor { template struct MinGradXYFunctor { - inline HOSTDEVICE paddle::framework::Array operator()( + inline HOSTDEVICE pten::framework::Array operator()( const InT& x, const InT& y, const InT& dout) { - paddle::framework::Array outs; + pten::framework::Array outs; // dx = dout * (x < y) outs[0] = static_cast(dout * static_cast(x < y)); // dy = dout * (x >= y) @@ -273,10 +273,10 @@ struct MulGradFunctor> { template struct MulGradXYFunctor { - inline HOSTDEVICE paddle::framework::Array operator()(const InT a, - const InT b, - const InT c) { - paddle::framework::Array outs; + inline HOSTDEVICE pten::framework::Array operator()(const InT a, + const InT b, + const InT c) { + pten::framework::Array outs; // dx = dout * y outs[0] = a * b; // dy = dout * x @@ -287,9 +287,9 @@ struct MulGradXYFunctor { template struct MulGradXYFunctor, 
Complex> { - inline HOSTDEVICE paddle::framework::Array, 2> operator()( + inline HOSTDEVICE pten::framework::Array, 2> operator()( const Complex a, const Complex b, const Complex c) { - paddle::framework::Array, 2> outs; + pten::framework::Array, 2> outs; // dx = dout * y Complex b_conj(b.real, -b.imag); outs[0] = a * b_conj; @@ -316,9 +316,9 @@ struct MaxGradYFunctor { template struct MaxGradXYFunctor { - inline HOSTDEVICE paddle::framework::Array operator()( + inline HOSTDEVICE pten::framework::Array operator()( const InT& x, const InT& y, const InT& dout) { - paddle::framework::Array outs; + pten::framework::Array outs; // dx = dout * (x > y) outs[0] = static_cast(dout * static_cast(x > y)); // dy = dout * (x <= y) diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 21e7079ff6233..c31139611e84c 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index da386052c7dc0..a595e5078b21d 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index 2bf259f7d7a7a..a34f4b8a22e57 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/histogram_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 93cfba1964684..fbfed71e1ecd4 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index baa292319d36e..0c0dde6bd4536 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -15,7 +15,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index a5afb18b3ff6f..4d6189b57bf1c 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -15,7 +15,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index ce45ed0301e92..45697073cbf85 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -20,6 +20,7 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/pten/core/ddim.h" namespace paddle { namespace operators { @@ -85,7 +86,7 @@ struct FastDivMod { template struct BroadcastConfig { FastDivMod divmoders[kDims]; - uint32_t strides[framework::DDim::kMaxRank]; + uint32_t strides[pten::framework::DDim::kMaxRank]; HOSTDEVICE BroadcastConfig() {} HOSTDEVICE BroadcastConfig(const std::vector& out_dims, diff --git 
a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h index 0bc53d7dd7b3b..40199677fe9a3 100644 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ b/paddle/fluid/operators/kldiv_loss_op.h @@ -13,7 +13,7 @@ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu index 3949a066e0868..b758efb065209 100644 --- a/paddle/fluid/operators/lstm_unit_op.cu +++ b/paddle/fluid/operators/lstm_unit_op.cu @@ -19,7 +19,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/cross_entropy_op.h" #include "paddle/fluid/operators/lstm_unit_op.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h index 3b28928a52892..f5ce5af70bd7a 100644 --- a/paddle/fluid/operators/math.h +++ b/paddle/fluid/operators/math.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" #include "math.h" // NOLINT diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h index 346c693a22d85..cbe1a03d90d85 100644 --- a/paddle/fluid/operators/math/algorithm.h +++ b/paddle/fluid/operators/math/algorithm.h @@ -18,7 +18,7 @@ #include // for int64_t #include -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/fluid/operators/math/complex_functors.h index 3214adb095376..48f16b87cbd66 
100644 --- a/paddle/fluid/operators/math/complex_functors.h +++ b/paddle/fluid/operators/math/complex_functors.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/cos_sim_functor.h b/paddle/fluid/operators/math/cos_sim_functor.h index 9a24bfc331266..61827af950bd5 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.h +++ b/paddle/fluid/operators/math/cos_sim_functor.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index db19818951d7c..e7ac1760d3b9c 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index f88b4a6e41cf9..89a1efe133387 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index 38bd1a3dadb63..def25a680cb95 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/platform/cpu_info.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/detail/gru_kernel.h b/paddle/fluid/operators/math/detail/gru_kernel.h index d9be8e80658fa..603f5f3426f0d 100644 --- a/paddle/fluid/operators/math/detail/gru_kernel.h +++ b/paddle/fluid/operators/math/detail/gru_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" // TODO(guosheng): refine code style in gru_kernel namespace paddle { diff --git a/paddle/fluid/operators/math/detail/lstm_kernel.h b/paddle/fluid/operators/math/detail/lstm_kernel.h index 003ec194366c9..33dcde4590068 100644 --- a/paddle/fluid/operators/math/detail/lstm_kernel.h +++ b/paddle/fluid/operators/math/detail/lstm_kernel.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include "paddle/fluid/operators/math/detail/activation_functions.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 50bddf73bc10c..ceeb85d6d36ef 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 4743f0dc9faf1..f0637a40b8cde 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 3c85da3c52c6c..ea08dc8084abf 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/modified_huber_loss_op.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h index 398676ba74151..4f552edf97bbe 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.h +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multinomial_op.h b/paddle/fluid/operators/multinomial_op.h index 14cfbd268389e..df4c2e9e7bbf6 100644 --- a/paddle/fluid/operators/multinomial_op.h +++ b/paddle/fluid/operators/multinomial_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nll_loss_op.cu b/paddle/fluid/operators/nll_loss_op.cu index 03af45634149d..e3c99afe820c2 100644 --- a/paddle/fluid/operators/nll_loss_op.cu +++ b/paddle/fluid/operators/nll_loss_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/nll_loss_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index 57986d262820d..7e8e37bd2ee8f 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -13,11 +13,11 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/roll_op.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/core/array.h" namespace paddle { namespace operators { @@ -28,9 +28,9 @@ using LoDTensor = framework::LoDTensor; template __global__ void RollCudaKernel(const T* input, T* output, int64_t N, - paddle::framework::Array shifts, - paddle::framework::Array strides, - paddle::framework::Array sizes) { + pten::framework::Array shifts, + pten::framework::Array strides, + pten::framework::Array sizes) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) { return; @@ -101,9 +101,9 @@ class RollKernel #define CALL_ROLL_CUDA_KERNEL(N) \ case N: { \ - paddle::framework::Array _strides; \ - paddle::framework::Array _shifts; \ - paddle::framework::Array _sizes; \ + pten::framework::Array _strides; \ + pten::framework::Array _shifts; \ + pten::framework::Array _sizes; \ for (size_t idx = 0; idx < N; ++idx) { \ _strides[idx] = strides[idx]; \ _shifts[idx] = shifts[idx]; \ diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu index cc012230c1062..de29822b8d7fe 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ 
b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -22,7 +22,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/smooth_l1_loss_op.h b/paddle/fluid/operators/smooth_l1_loss_op.h index efe3afba18e8f..e30b48b1500ed 100644 --- a/paddle/fluid/operators/smooth_l1_loss_op.h +++ b/paddle/fluid/operators/smooth_l1_loss_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h index cfd4d6bce8364..413470e3db5d4 100644 --- a/paddle/fluid/operators/unstack_op.h +++ b/paddle/fluid/operators/unstack_op.h @@ -20,7 +20,6 @@ limitations under the License. */ #if defined(__NVCC__) || defined(__HIPCC__) #include -#include "paddle/fluid/framework/array.h" #endif namespace paddle { diff --git a/paddle/fluid/platform/aligned_vector.h b/paddle/fluid/platform/aligned_vector.h index 7d014f6bdcb0b..144c017414a5d 100644 --- a/paddle/fluid/platform/aligned_vector.h +++ b/paddle/fluid/platform/aligned_vector.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index 2b3d1693f6245..872a6cf062eef 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -17,7 +17,7 @@ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index cc9919d8366be..e3a391462878a 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/hostdevice.h" #if defined(__NVCC__) || defined(__HIPCC__) #include diff --git a/paddle/fluid/platform/transform_test.cu b/paddle/fluid/platform/transform_test.cu index 23f5865971246..32ec113d1f5e5 100644 --- a/paddle/fluid/platform/transform_test.cu +++ b/paddle/fluid/platform/transform_test.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/core/hostdevice.h" template class Scale { diff --git a/paddle/pten/api/include/tensor.h b/paddle/pten/api/include/tensor.h index c26c9ce839458..d2afd703eaf2a 100644 --- a/paddle/pten/api/include/tensor.h +++ b/paddle/pten/api/include/tensor.h @@ -42,12 +42,12 @@ class DenseTensor; namespace pten { class TensorBase; +namespace framework { +class DDim; +} // namespace framework } // namespace pten namespace paddle { -namespace framework { -class DDim; -} namespace experimental { @@ -159,9 +159,9 @@ class PADDLE_API Tensor final { /** * @brief Return the dimensions of Tensor. * - * @return paddle::framework::DDim + * @return pten::framework::DDim */ - paddle::framework::DDim dims() const; + pten::framework::DDim dims() const; /** * @brief Return the shape (dimensions) of Tensor. diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index cb70d26f947b8..0ccc9c56dbff7 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -47,13 +47,13 @@ limitations under the License. */ * In the future, the necessary components will be moved to the this library, * or the corresponding components will be re-implemented. 
*/ -#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/stream/cuda_stream.h" #include "paddle/pten/common/complex.h" #include "paddle/pten/common/float16.h" +#include "paddle/pten/core/ddim.h" namespace paddle { namespace experimental { @@ -94,10 +94,10 @@ int64_t Tensor::numel() const { return impl_->numel(); } int64_t Tensor::size() const { return impl_->numel(); } -paddle::framework::DDim Tensor::dims() const { return impl_->dims(); } +pten::framework::DDim Tensor::dims() const { return impl_->dims(); } std::vector Tensor::shape() const { - return paddle::framework::vectorize(impl_->dims()); + return pten::framework::vectorize(impl_->dims()); } void Tensor::reshape(const std::vector &shape) { diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index facc9ac005662..eabc5a19babad 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -15,6 +15,15 @@ cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector) cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base) cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base ) +cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) +cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) +if(WITH_GPU) + nv_test(dim_test SRCS dim_test.cu DEPS ddim) +elseif(WITH_ROCM) + hip_test(dim_test SRCS dim_test.cu DEPS ddim) +endif() + # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) add_dependencies(dense_tensor mkldnn) diff --git a/paddle/fluid/framework/array.h b/paddle/pten/core/array.h similarity index 94% rename from paddle/fluid/framework/array.h rename to paddle/pten/core/array.h index 0ec9cb81129c2..86d222d2d57b3 100644 --- a/paddle/fluid/framework/array.h +++ b/paddle/pten/core/array.h @@ -1,4 +1,4 
@@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -15,10 +15,12 @@ #pragma once #include -#include "paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/pten/core/unroll_array_ops.h" +// TODO(paddle-dev): Need to modify into pten/core/enforce.h #include "paddle/fluid/platform/enforce.h" -namespace paddle { +namespace pten { +namespace platform = paddle::platform; namespace framework { template @@ -146,4 +148,4 @@ class Array { }; } // namespace framework -} // namespace paddle +} // namespace pten diff --git a/paddle/fluid/framework/ddim.cc b/paddle/pten/core/ddim.cc similarity index 77% rename from paddle/fluid/framework/ddim.cc rename to paddle/pten/core/ddim.cc index 8bac8b7df6d2d..663f92a5bf8d0 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/pten/core/ddim.cc @@ -1,22 +1,22 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/ddim.h" +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/core/ddim.h" #include -#include "paddle/fluid/platform/enforce.h" -namespace paddle { +namespace pten { +namespace platform = paddle::platform; namespace framework { DDim make_ddim(std::initializer_list dims) { @@ -82,10 +82,13 @@ bool contain_unknown_dim(const DDim& ddim) { DDim slice_ddim(const DDim& dim, int begin, int end) { PADDLE_ENFORCE_EQ( - (begin >= 0 && end <= dim.size()), true, + (begin >= 0 && end <= dim.size()), + true, platform::errors::InvalidArgument( - "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", begin, - end, dim.size())); + "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", + begin, + end, + dim.size())); // Constructor of DDim would check whether end - begin is valid return DDim(dim.Get() + begin, end - begin); } @@ -108,27 +111,34 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { } DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims) { - PADDLE_ENFORCE_GE(src.size(), 3, + PADDLE_ENFORCE_GE(src.size(), + 3, platform::errors::InvalidArgument( "The rank of src dim should be at least 3 " "in flatten_to_3d, but received %d.", src.size())); - PADDLE_ENFORCE_EQ((num_row_dims >= 1 && num_row_dims < src.size()), true, + PADDLE_ENFORCE_EQ((num_row_dims >= 1 && num_row_dims < src.size()), + true, platform::errors::InvalidArgument( "The num_row_dims should be inside [1, %d] " "in flatten_to_3d, but received %d.", - src.size() - 1, num_row_dims)); - PADDLE_ENFORCE_EQ((num_col_dims >= 2 && num_col_dims <= src.size()), true, + src.size() - 1, + 
num_row_dims)); + PADDLE_ENFORCE_EQ((num_col_dims >= 2 && num_col_dims <= src.size()), + true, platform::errors::InvalidArgument( "The num_col_dims should be inside [2, %d] " "in flatten_to_3d, but received %d.", - src.size(), num_col_dims)); + src.size(), + num_col_dims)); PADDLE_ENFORCE_GE( - num_col_dims, num_row_dims, + num_col_dims, + num_row_dims, platform::errors::InvalidArgument( "The num_row_dims should be less than num_col_dims in flatten_to_3d," "but received num_row_dims = %d, num_col_dims = %d.", - num_row_dims, num_col_dims)); + num_row_dims, + num_col_dims)); return DDim({product(slice_ddim(src, 0, num_row_dims)), product(slice_ddim(src, num_row_dims, num_col_dims)), @@ -169,13 +179,16 @@ DDim DDim::reshape(const std::vector& shape) const { out_dims.rank_ = shape.size(); for (size_t i = 0; i < shape.size(); ++i) { if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE_LT(static_cast(i), in_dims.size(), + PADDLE_ENFORCE_LT(static_cast(i), + in_dims.size(), platform::errors::InvalidArgument( "Index %d of shape under which the value of 0 " "is stored, must be lower than the number of " "old dimensions. But received shape[%d] = 0, " "dimensions = %d, shape = [%s].", - i, in_dims.size(), in_dims)); + i, + in_dims.size(), + in_dims)); out_dims[i] = in_dims[i]; } else { out_dims[i] = shape[i]; @@ -190,19 +203,23 @@ DDim DDim::transpose(const std::vector& axis) const { size_t axis_size = axis.size(); auto axis_set = std::set(axis.begin(), axis.end()); - PADDLE_ENFORCE_EQ(axis_set.size(), axis_size, + PADDLE_ENFORCE_EQ(axis_set.size(), + axis_size, platform::errors::InvalidArgument( "In an axis array, elements must be unique.")); PADDLE_ENFORCE_EQ( - in_rank, axis_size, + in_rank, + axis_size, platform::errors::InvalidArgument("The input dimension's size " "should be equal to the axis's size. 
" "But received dimension is %d, " "axis's size is %d", - in_rank, axis_size)); + in_rank, + axis_size)); - PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), axis_size, + PADDLE_ENFORCE_LT(*std::max_element(axis.begin(), axis.end()), + axis_size, platform::errors::InvalidArgument( "Axis values must be ranging from 0 to (dims - 1).")); @@ -214,4 +231,4 @@ DDim DDim::transpose(const std::vector& axis) const { } } // namespace framework -} // namespace paddle +} // namespace pten \ No newline at end of file diff --git a/paddle/pten/core/ddim.h b/paddle/pten/core/ddim.h new file mode 100644 index 0000000000000..148c32481c008 --- /dev/null +++ b/paddle/pten/core/ddim.h @@ -0,0 +1,257 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include +#include +#include +#include + +#include "paddle/pten/core/dim.h" + +namespace pten { +namespace platform = paddle::platform; +namespace framework { + +#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ + case (rank): { \ + constexpr auto kRank = (rank); \ + return (callback); \ + } + +#define PADDLE_VISIT_DDIM(rank, callback) \ + switch (rank) { \ + PADDLE_VISIT_DDIM_BASE(0, callback); \ + PADDLE_VISIT_DDIM_BASE(1, callback); \ + PADDLE_VISIT_DDIM_BASE(2, callback); \ + PADDLE_VISIT_DDIM_BASE(3, callback); \ + PADDLE_VISIT_DDIM_BASE(4, callback); \ + PADDLE_VISIT_DDIM_BASE(5, callback); \ + PADDLE_VISIT_DDIM_BASE(6, callback); \ + PADDLE_VISIT_DDIM_BASE(7, callback); \ + PADDLE_VISIT_DDIM_BASE(8, callback); \ + PADDLE_VISIT_DDIM_BASE(9, callback); \ + default: \ + PADDLE_THROW(platform::errors::Unimplemented( \ + "Invalid dimension to be accessed. Now only supports access to " \ + "dimension 0 to 9, but received dimension is %d.", \ + rank)); \ + } + +template +inline void dynamic_dim_assign(const T1* in, T2* out, int n) { + PADDLE_VISIT_DDIM(n, (static_dim_assign(in, out))); +} + +/** + * \brief A dynamically sized dimension. + * + * The number of dimensions must be between [1, 9]. 
+ */ +class DDim { + public: + constexpr static int kMaxRank = 9; + + DDim() : rank_(1) { dim_[0] = 0; } + + DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } + + DDim(const int* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } + + DDim(const int64_t* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } + + template + /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT + UnsafeCast() = in; + } + + /*implicit*/ DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} + + inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } + + template + inline DDim& operator=(const Dim& dim) { + rank_ = D; + UnsafeCast() = dim; + return *this; + } + + inline int64_t& operator[](int idx) { return dim_[idx]; } + + inline int64_t operator[](int idx) const { return dim_[idx]; } + + int64_t& at(int idx) { + PADDLE_ENFORCE_GE(idx, + 0, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + PADDLE_ENFORCE_LT(idx, + rank_, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + return dim_[idx]; + } + + int64_t at(int idx) const { + PADDLE_ENFORCE_GE(idx, + 0, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + PADDLE_ENFORCE_LT(idx, + rank_, + platform::errors::InvalidArgument( + "Invalid DDim index to be accessed. 
The valid index " + "is between 0 and %d, but received index is %d.", + rank_, + idx)); + return dim_[idx]; + } + + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); + } + + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) const { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); + } + + bool operator==(const DDim& d) const; + + bool operator!=(const DDim& d) const; + + inline const int64_t* Get() const { return dim_.Get(); } + + inline int64_t* GetMutable() { return dim_.GetMutable(); } + + inline int size() const { return rank_; } + + std::string to_str() const; + + DDim reshape(const std::vector& shape) const; + + DDim transpose(const std::vector& axis) const; + + private: + template + inline Dim& UnsafeCast() { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + template + inline const Dim& UnsafeCast() const { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + inline DDim& CopyFrom(const DDim& ddim) { + PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); + } + + friend DDim stride(const DDim& ddim); + friend DDim stride_numel(const DDim& ddim); + + private: + Dim dim_; + int rank_; +}; + +#undef PADDLE_VISIT_DDIM_BASE +#undef PADDLE_VISIT_DDIM + +/** + * \brief Make a DDim from std::vector + * + * \param dims A vector of ints. Must be sized between [1, 9] + */ +DDim make_ddim(const std::vector& dims); + +DDim make_ddim(const std::vector& dims); + +/** + * \brief Make a DDim from an initializer list + * + * \param dims An initializer list of ints.
Must be sized between [1, 9] + * + */ +DDim make_ddim(std::initializer_list dims); + +template +std::vector vectorize(const DDim& ddim) { + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); + return result; +} + +int64_t product(const DDim& ddim); + +bool contain_unknown_dim(const DDim& ddim); + +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + +/** + * \brief What is the length of this dimension? + * + * \param Dynamic dimension to inspect + */ + +int arity(const DDim& ddim); + +std::ostream& operator<<(std::ostream&, const DDim&); + +/** +* \brief Flatten dim to 3d +* e.g., DDim d = make_ddim({1, 2, 3, 4, 5, 6}) +* flatten_to_3d(d, 2, 4); ===> {1*2, 3*4, 5*6} ===> {2, 12, 30} +*/ +DDim flatten_to_3d(const DDim& src, int num_row_dims, int num_col_dims); + +// Reshape a tensor to a matrix. The matrix's first dimension(column length) +// will be the product of tensor's first `num_col_dims` dimensions. +DDim flatten_to_2d(const DDim& src, int num_col_dims); + +DDim flatten_to_1d(const DDim& src); + +DDim stride(const DDim& ddim); + +DDim stride_numel(const DDim& ddim); +} // namespace framework +} // namespace pten diff --git a/paddle/pten/core/ddim_test.cc b/paddle/pten/core/ddim_test.cc new file mode 100644 index 0000000000000..1903bbfdff135 --- /dev/null +++ b/paddle/pten/core/ddim_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "gtest/gtest.h" +#include "paddle/pten/core/ddim.h" + +TEST(DDim, Equality) { + // construct a DDim from an initialization list + pten::framework::DDim ddim = pten::framework::make_ddim({9, 1, 5}); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // construct a DDim from a vector + std::vector vec({9, 1, 5}); + pten::framework::DDim vddim = pten::framework::make_ddim(vec); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // mutate a DDim + ddim[1] = 2; + EXPECT_EQ(ddim[1], 2); + ddim[0] = 6; + EXPECT_EQ(ddim[0], 6); + + // vectorize a DDim + std::vector res_vec = pten::framework::vectorize(vddim); + EXPECT_EQ(res_vec[0], 9); + EXPECT_EQ(res_vec[1], 1); + EXPECT_EQ(res_vec[2], 5); + pten::framework::Dim<3> d(3, 2, 1); + res_vec = pten::framework::vectorize(pten::framework::DDim(d)); + EXPECT_EQ(res_vec[0], 3); + EXPECT_EQ(res_vec[1], 2); + EXPECT_EQ(res_vec[2], 1); + + // arity of a DDim + EXPECT_EQ(pten::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); + + // product of a DDim + EXPECT_EQ(pten::framework::product(vddim), 45); + EXPECT_EQ(pten::framework::product(pten::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + pten::framework::DDim ddim2 = pten::framework::make_ddim({1, 2, 3, 4, 5, 6}); + pten::framework::DDim ss = pten::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + pten::framework::DDim ss2 = pten::framework::slice_ddim(ddim2, 0, 6); + 
EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); +} + +TEST(DDim, Print) { + // print a DDim + std::stringstream ss; + pten::framework::DDim ddim = pten::framework::make_ddim({2, 3, 4}); + ss << ddim; + EXPECT_EQ("2, 3, 4", ss.str()); +} diff --git a/paddle/pten/core/dim.h b/paddle/pten/core/dim.h new file mode 100644 index 0000000000000..8dd984891a894 --- /dev/null +++ b/paddle/pten/core/dim.h @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/pten/core/array.h" +#include "paddle/pten/core/hostdevice.h" + +namespace pten { +namespace framework { + +// Statically sized, statically indexed dimension +template +class Dim : public Array { + public: + static_assert(D >= 0, "D must be not less than 0"); + + static constexpr int kRank = D; + using BaseClass = Array; + + inline Dim(int64_t head, const Dim& tail) { + (*this)[0] = head; + new (this->GetMutable() + 1) Dim(tail); + } + + template + HOSTDEVICE explicit Dim(int64_t head, Args... args) + : BaseClass(head, args...) 
{} + + /** Construct a Dim with each dimension set to the given index */ + HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } + + HOSTDEVICE Dim() = default; + + HOST std::string to_string() const; +}; + +// Product of a Dim +template +HOSTDEVICE inline int64_t product(const Dim& a) { + return UnrollProduct::Run(a.Get()); +} + +/** + * Helper function to create a Dim + * + * \param idxes The type of Dim constructed depends on the number of params + * + */ + +template +HOSTDEVICE inline Dim make_dim(Args... idxes) { + return Dim(idxes...); +} + +// Allows us to output a Dim +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { + os << d[0]; + for (int i = 1; i < D; ++i) { + os << ", " << d[i]; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { + return os; +} + +template +HOST std::string Dim::to_string() const { + std::stringstream stream; + stream << *this; + return stream.str(); +} + +template +inline void static_dim_assign(const T1* in, T2* out) { + UnrollAssign::Run(in, out); +} + +} // namespace framework +} // namespace pten diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/pten/core/dim_test.cu similarity index 62% rename from paddle/fluid/framework/dim_test.cu rename to paddle/pten/core/dim_test.cu index b3c26b10c6ffb..0f8d71c5d3b4c 100644 --- a/paddle/fluid/framework/dim_test.cu +++ b/paddle/pten/core/dim_test.cu @@ -1,42 +1,43 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + #include #include #include "gtest/gtest.h" -#include "paddle/fluid/framework/dim.h" +#include "paddle/pten/core/dim.h" -__global__ void test(paddle::framework::Dim<2>* o) { - o[0] = paddle::framework::make_dim(5, 6); +__global__ void test(pten::framework::Dim<2>* o) { + o[0] = pten::framework::make_dim(5, 6); } __global__ void dyn_idx_gpu(int64_t* o) { - auto d = paddle::framework::make_dim(5, 6); + auto d = pten::framework::make_dim(5, 6); o[0] = d[1]; } TEST(Dim, Equality) { // construct a Dim on the CPU - auto a = paddle::framework::make_dim(3, 4); + auto a = pten::framework::make_dim(3, 4); EXPECT_EQ(a[0], 3); EXPECT_EQ(a[1], 4); // construct a Dim on the GPU - thrust::device_vector> t(2); + thrust::device_vector> t(2); #ifdef PADDLE_WITH_HIP - hipLaunchKernelGGL(test, dim3(1), dim3(1), 0, 0, - thrust::raw_pointer_cast(t.data())); + hipLaunchKernelGGL( + test, dim3(1), dim3(1), 0, 0, thrust::raw_pointer_cast(t.data())); #else test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); #endif @@ -45,10 +46,10 @@ TEST(Dim, Equality) { EXPECT_EQ(a[1], 6); // product - EXPECT_EQ(paddle::framework::product(a), 30); + EXPECT_EQ(pten::framework::product(a), 30); // mutate a Dim - auto b = paddle::framework::make_dim(7, 8); + auto b = pten::framework::make_dim(7, 8); b[1] = 10; EXPECT_EQ(b[0], 7); EXPECT_EQ(b[1], 10); @@ -61,8 +62,8 @@ TEST(Dim, Equality) { // dynamic access on GPU thrust::device_vector r(1); #ifdef PADDLE_WITH_HIP - hipLaunchKernelGGL(dyn_idx_gpu, dim3(1), dim3(1), 0, 0, - 
thrust::raw_pointer_cast(r.data())); + hipLaunchKernelGGL( + dyn_idx_gpu, dim3(1), dim3(1), 0, 0, thrust::raw_pointer_cast(r.data())); #else dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); #endif @@ -71,9 +72,9 @@ TEST(Dim, Equality) { } TEST(Dim, Bool) { - auto a = paddle::framework::make_dim(3, 4); - auto b = paddle::framework::make_dim(5, 6); - auto c = paddle::framework::make_dim(3, 4); + auto a = pten::framework::make_dim(3, 4); + auto b = pten::framework::make_dim(5, 6); + auto c = pten::framework::make_dim(3, 4); // comparison EXPECT_TRUE(a == a); @@ -84,13 +85,13 @@ TEST(Dim, Bool) { TEST(Dim, Print) { { std::stringstream ss; - auto a = paddle::framework::make_dim(2, 3); + auto a = pten::framework::make_dim(2, 3); ss << a; EXPECT_EQ(ss.str(), "2, 3"); } { std::stringstream ss; - ss << paddle::framework::make_dim(8); + ss << pten::framework::make_dim(8); EXPECT_EQ(ss.str(), "8"); } -} +} \ No newline at end of file diff --git a/paddle/fluid/platform/hostdevice.h b/paddle/pten/core/hostdevice.h similarity index 89% rename from paddle/fluid/platform/hostdevice.h rename to paddle/pten/core/hostdevice.h index 65005a5adbb1d..08fe3125287d7 100644 --- a/paddle/fluid/platform/hostdevice.h +++ b/paddle/pten/core/hostdevice.h @@ -1,16 +1,17 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // -// http://www.apache.org/licenses/LICENSE-2.0 +// http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. + #pragma once #ifdef __HIPCC__ diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h index 528a52cee8da4..662553cbcb598 100644 --- a/paddle/pten/core/tensor_base.h +++ b/paddle/pten/core/tensor_base.h @@ -14,11 +14,11 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/platform/place.h" #include "paddle/pten/common/backend.h" #include "paddle/pten/common/data_type.h" #include "paddle/pten/common/layout.h" +#include "paddle/pten/core/ddim.h" #include "paddle/pten/core/storage.h" #include "paddle/pten/core/utils/type_registry.h" @@ -28,7 +28,7 @@ class TensorBase { public: using DataType = paddle::experimental::DataType; using DataLayout = paddle::experimental::DataLayout; - using DDim = paddle::framework::DDim; + using DDim = pten::framework::DDim; using Place = paddle::platform::Place; virtual ~TensorBase() = default; diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h index 2df6b48b674a7..ac3f17267c4f9 100644 --- a/paddle/pten/core/tensor_meta.h +++ b/paddle/pten/core/tensor_meta.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/pten/common/layout.h" // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/ddim.h" +#include "paddle/pten/core/ddim.h" // Note: mixed_vector include many header now, LoD will be // used on CUDA device? Can we use small_vector here? @@ -30,7 +30,7 @@ limitations under the License. */ namespace pten { -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; using LoD = std::vector>; /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. 
diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/pten/core/unroll_array_ops.h similarity index 96% rename from paddle/fluid/framework/unroll_array_ops.h rename to paddle/pten/core/unroll_array_ops.h index a9c047cc6c6ac..fb0358375a58e 100644 --- a/paddle/fluid/framework/unroll_array_ops.h +++ b/paddle/pten/core/unroll_array_ops.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,9 +16,9 @@ #include #include -#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/core/hostdevice.h" -namespace paddle { +namespace pten { namespace framework { namespace detail { @@ -130,4 +130,4 @@ template using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; } // namespace framework -} // namespace paddle +} // namespace pten diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/pten/core/unroll_array_ops_test.cc similarity index 92% rename from paddle/fluid/framework/unroll_array_ops_test.cc rename to paddle/pten/core/unroll_array_ops_test.cc index c4fdfdb425f23..f32d94be759be 100644 --- a/paddle/fluid/framework/unroll_array_ops_test.cc +++ b/paddle/pten/core/unroll_array_ops_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/pten/core/unroll_array_ops.h" #include #include -namespace paddle { +namespace pten { namespace framework { template @@ -79,4 +79,4 @@ TEST(unroll_ops, product) { } } // namespace framework -} // namespace paddle +} // namespace pten \ No newline at end of file diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index ea587806bfcb2..083fb0fca2188 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -64,8 +64,8 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, const DenseTensorMeta& y_meta, bool trans_x, bool trans_y) { - std::vector dims_x = paddle::framework::vectorize(x_meta.dims); - std::vector dims_y = paddle::framework::vectorize(y_meta.dims); + std::vector dims_x = pten::framework::vectorize(x_meta.dims); + std::vector dims_y = pten::framework::vectorize(y_meta.dims); auto ndims_x = dims_x.size(); auto ndims_y = dims_y.size(); PADDLE_ENFORCE_GT(ndims_x, @@ -125,7 +125,7 @@ DenseTensorMeta MatmulInferMeta(const DenseTensorMeta& x_meta, new_dims.push_back(1); } - auto ddim_out = paddle::framework::make_ddim(new_dims); + auto ddim_out = pten::framework::make_ddim(new_dims); return {x_meta.dtype, ddim_out, x_meta.layout}; } @@ -169,7 +169,7 @@ DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, out_dims_array.data(), max_dim, axis); - return_meta.dims = paddle::framework::make_ddim(out_dims_array); + return_meta.dims = pten::framework::make_ddim(out_dims_array); } return_meta.lod = x_meta.lod; return return_meta; diff --git a/paddle/pten/infermeta/nullary.cc b/paddle/pten/infermeta/nullary.cc index 731e69e60907b..19e11f049fee7 100644 --- a/paddle/pten/infermeta/nullary.cc +++ b/paddle/pten/infermeta/nullary.cc @@ -20,14 +20,14 @@ namespace pten { DenseTensorMeta CreateInferMeta(const std::vector& shape, DataType dtype, DataLayout layout) { - const auto& out_dims = paddle::framework::make_ddim(shape); + 
const auto& out_dims = pten::framework::make_ddim(shape); return {dtype, out_dims, layout}; } DenseTensorMeta CreateInferMeta(const ScalarArray& shape, DataType dtype, DataLayout layout) { - const auto& out_dims = paddle::framework::make_ddim(shape.GetData()); + const auto& out_dims = pten::framework::make_ddim(shape.GetData()); return {dtype, out_dims, layout}; } diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc index 843a78f3413cf..27e1dc9511df2 100644 --- a/paddle/pten/infermeta/unary.cc +++ b/paddle/pten/infermeta/unary.cc @@ -23,7 +23,7 @@ DenseTensorMeta UnchangedInferMeta(const DenseTensorMeta& x_meta) { } DenseTensorMeta ReductionInferMeta(const DenseTensorMeta& x_meta) { - const auto& out_dims = paddle::framework::make_ddim({1}); + const auto& out_dims = pten::framework::make_ddim({1}); DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout); return return_meta; } @@ -63,7 +63,7 @@ DenseTensorMeta FlattenInferMeta(const DenseTensorMeta& x_meta, for (int i = stop_axis + 1; i < in_dims_size; i++) { out_shape.push_back(x_dims[i]); } - const auto& out_dims = paddle::framework::make_ddim(out_shape); + const auto& out_dims = pten::framework::make_ddim(out_shape); DenseTensorMeta return_meta(x_meta.dtype, out_dims, x_meta.layout); if (x_dims[0] == return_meta.dims[0]) { @@ -89,10 +89,10 @@ DenseTensorMeta CreateLikeInferMeta(const DenseTensorMeta& x_meta, layout == DataLayout::UNDEFINED ? 
x_meta.layout : layout}; } -static paddle::framework::DDim ValidateShape( - const std::vector shape, const paddle::framework::DDim& in_dims) { - const int64_t in_size = paddle::framework::product(in_dims); - auto in_dims_vec = paddle::framework::vectorize(in_dims); +static pten::framework::DDim ValidateShape( + const std::vector shape, const pten::framework::DDim& in_dims) { + const int64_t in_size = pten::framework::product(in_dims); + auto in_dims_vec = pten::framework::vectorize(in_dims); bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(), [](int64_t i) { return i > 0; }); @@ -112,7 +112,7 @@ static paddle::framework::DDim ValidateShape( paddle::platform::errors::InvalidArgument( "Only one dimension value of 'shape' in ReshapeOp can " "be -1. But received shape = [%s], shape[%d] is also -1.", - paddle::framework::make_ddim(shape), + pten::framework::make_ddim(shape), i)); unk_dim_idx = i; } else if (shape[i] == copy_dim_val) { @@ -124,7 +124,7 @@ static paddle::framework::DDim ValidateShape( "the input tensor X's dimensions. " "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " "X's dimensions = %d.", - paddle::framework::make_ddim(shape), + pten::framework::make_ddim(shape), i, in_dims, in_dims.size())); @@ -136,7 +136,7 @@ static paddle::framework::DDim ValidateShape( "Each dimension value of 'shape' in ReshapeOp must not " "be negative except one unknown dimension. 
" "But received shape = [%s], shape[%d] = %d.", - paddle::framework::make_ddim(shape), + pten::framework::make_ddim(shape), i, shape[i])); } @@ -165,7 +165,7 @@ static paddle::framework::DDim ValidateShape( "'shape' is [%s], known capacity of 'shape' is %d.", in_dims, in_size, - paddle::framework::make_ddim(shape), + pten::framework::make_ddim(shape), capacity)); } else { output_shape[unk_dim_idx] = -1; @@ -183,7 +183,7 @@ static paddle::framework::DDim ValidateShape( "[%s], the capacity of 'shape' is %d.", in_dims, in_size, - paddle::framework::make_ddim(shape), + pten::framework::make_ddim(shape), capacity)); } } @@ -202,11 +202,11 @@ static paddle::framework::DDim ValidateShape( "capacity of 'Out' is %d.", in_dims, in_size, - paddle::framework::make_ddim(shape), + pten::framework::make_ddim(shape), capacity)); } - return paddle::framework::make_ddim(output_shape); + return pten::framework::make_ddim(output_shape); } DenseTensorMeta InferMetaFromVecValue(const DenseTensorMeta& x_meta, @@ -267,7 +267,7 @@ DenseTensorMeta ReduceInferMeta(const DenseTensorMeta& x_meta, out_dim_vector.push_back(1); } } - DDim out_dim = paddle::framework::make_ddim(out_dim_vector); + DDim out_dim = pten::framework::make_ddim(out_dim_vector); DataType out_dtype; if (dtype != DataType::UNDEFINED) { diff --git a/paddle/pten/kernels/cpu/elementwise.h b/paddle/pten/kernels/cpu/elementwise.h index f048678111cf2..e4f426d3f8eb4 100644 --- a/paddle/pten/kernels/cpu/elementwise.h +++ b/paddle/pten/kernels/cpu/elementwise.h @@ -583,8 +583,8 @@ void CommonElementwiseBroadcastBackward(const CPUContext& ctx, } VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << paddle::framework::make_ddim(x_dims_array) - << " ydim:" << paddle::framework::make_ddim(y_dims_array); + << pten::framework::make_ddim(x_dims_array) + << " ydim:" << pten::framework::make_ddim(y_dims_array); CommonGradBroadcastCPU(x, y, diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index 
b38f17aa02a55..86443c254bf67 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -50,13 +50,13 @@ void ReduceFunctor(const DeviceContext& context, DDim out_dims = output->dims(); if (keep_dim && x_rank > 1) { const int kDelFlag = -2; - auto dims_vector = paddle::framework::vectorize(out_dims); + auto dims_vector = pten::framework::vectorize(out_dims); for (size_t i = 0; i < dims_ref.size(); ++i) { dims_vector[dims_ref[i]] = kDelFlag; } dims_vector.erase(remove(dims_vector.begin(), dims_vector.end(), kDelFlag), dims_vector.end()); - out_dims = paddle::framework::make_ddim(dims_vector); + out_dims = pten::framework::make_ddim(dims_vector); } auto& place = *context.eigen_device(); Functor functor; diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index d6a155dca0176..2deac0146c52c 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -24,7 +24,7 @@ template void EmptyKernel(const Context& dev_ctx, const ScalarArray& shape, DenseTensor* out) { - out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData())); + out->ResizeAndAllocate(pten::framework::make_ddim(shape.GetData())); } template diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index e45ac516e16ed..cbbf62f1993e2 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -25,8 +25,7 @@ void FlattenGradKernel(const Context& dev_ctx, const DenseTensor& xshape, DenseTensor* x_grad) { auto xshape_dims = xshape.dims(); - auto x_dims = - paddle::framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + auto x_dims = pten::framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); pten::Copy(dev_ctx, out_grad, false, x_grad); x_grad->ResizeAndAllocate(x_dims); } diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index 8693fd2b36c4e..6bb45ad199510 100644 
--- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -26,7 +26,7 @@ inline void SetXShape(const DenseTensor &x, DenseTensor *xshape) { for (int i = 0; i < in_dims.size(); ++i) { xshape_dims[i + 1] = in_dims[i]; } - xshape->ResizeAndAllocate(paddle::framework::make_ddim(xshape_dims)); + xshape->ResizeAndAllocate(pten::framework::make_ddim(xshape_dims)); xshape->ResetLoD(x.meta().lod); } diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 7396c64de9eab..47924c4e2ae18 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -36,10 +36,10 @@ enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3, kAny = -1 }; for supporting multiple-output feature in elementwise system.*/ template using ConditionalT = - typename std::conditional_t>; + typename std::conditional_t>; namespace funcs { -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; template struct ElemwiseGradNoBroadcast { @@ -303,9 +303,9 @@ inline DDim trim_trailing_singular_dims(const DDim &dims) { trim_dims[i] = dims[i]; } if (trim_dims.size() == 0) { - return DDim(paddle::framework::make_dim()); + return DDim(pten::framework::make_dim()); } - DDim actual_dims = paddle::framework::make_ddim(trim_dims); + DDim actual_dims = pten::framework::make_ddim(trim_dims); return actual_dims; } @@ -377,7 +377,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, DenseTensor *dy, DX_OP dx_op, DY_OP dy_op) { - size_t N = static_cast(paddle::framework::product(x_dim)); + size_t N = static_cast(pten::framework::product(x_dim)); paddle::platform::ForRange for_range(dev_ctx, N); for_range(ElemwiseGradNoBroadcast{ x.data(), @@ -462,7 +462,7 @@ struct ElementwisePrimitiveCaller { template struct ElementwiseWriteDataCaller { __device__ __forceinline__ void operator()( - paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, + 
pten::framework::Array<_ptr_ OutT *, NumOuts> outs, ConditionalT src[VecSize], int block_offset, int num) { @@ -485,7 +485,7 @@ struct ElementwiseWriteDataCaller { template struct ElementwiseWriteDataCaller { __device__ __forceinline__ void operator()( - paddle::framework::Array<_ptr_ OutT *, 1> outs, + pten::framework::Array<_ptr_ OutT *, 1> outs, OutT src[VecSize], int block_offset, int num) { @@ -502,8 +502,8 @@ template __device__ void VectorizedElementwiseKernelImpl( - const paddle::framework::Array &in, - paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, + const pten::framework::Array &in, + pten::framework::Array<_ptr_ OutT *, NumOuts> outs, int num, int data_offset, Functor func) { @@ -537,8 +537,8 @@ template __global__ void VectorizedElementwiseKernel( - paddle::framework::Array ins, - paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, + pten::framework::Array ins, + pten::framework::Array<_ptr_ OutT *, NumOuts> outs, int size, int main_offset, Functor func) { @@ -578,8 +578,8 @@ void ElementwiseCudaKernel(const KPDevice &ctx, std::vector *outs, Functor func) { auto numel = ins[0]->numel(); - paddle::framework::Array ins_data; - paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data; + pten::framework::Array ins_data; + pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data; for (int i = 0; i < Arity; ++i) { ins_data[i] = ins[i]->data(); diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h index 6b89902456ac8..6d139d68530be 100644 --- a/paddle/pten/kernels/funcs/elementwise_functor.h +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/hostdevice.h" #include "paddle/pten/common/float16.h" +#include "paddle/pten/core/hostdevice.h" namespace pten { namespace funcs { diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc index 77d26fcbc3536..90a6859a85091 100644 --- a/paddle/pten/kernels/funcs/transpose.cc +++ b/paddle/pten/kernels/funcs/transpose.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/pten/kernels/funcs/transpose.h" -#include "paddle/fluid/framework/ddim.h" #include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/ddim.h" #include "paddle/pten/core/dense_tensor.h" // See Note [ Why still include the fluid headers? ] @@ -33,8 +33,8 @@ struct TransposeNormal { pten::DenseTensor* out, const std::vector& axis) { const int rank = axis.size(); - auto in_stride = paddle::framework::stride(in.dims()); - auto out_stride = paddle::framework::stride(out->dims()); + auto in_stride = pten::framework::stride(in.dims()); + auto out_stride = pten::framework::stride(out->dims()); const T* in_ptr = in.data(); T* out_ptr = out->mutable_data(); diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu index 045bfdbdb051c..474a7c4ea4de9 100644 --- a/paddle/pten/kernels/funcs/transpose.cu +++ b/paddle/pten/kernels/funcs/transpose.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/ddim.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/funcs/transpose.h" @@ -58,8 +58,8 @@ struct TransposeNormal { pten::DenseTensor* out, const std::vector& axis) { const int rank = axis.size(); - auto in_stride = paddle::framework::stride(in.dims()); - auto out_stride = paddle::framework::stride(out->dims()); + auto in_stride = pten::framework::stride(in.dims()); + auto out_stride = pten::framework::stride(out->dims()); auto* in_ptr = in.data(); auto* out_ptr = out->mutable_data(); diff --git a/paddle/pten/kernels/funcs/transpose.h b/paddle/pten/kernels/funcs/transpose.h index d0e4dafe2c3b8..0cb2b4289fe6e 100644 --- a/paddle/pten/kernels/funcs/transpose.h +++ b/paddle/pten/kernels/funcs/transpose.h @@ -14,7 +14,7 @@ #pragma once -#include "paddle/fluid/framework/ddim.h" +#include "paddle/pten/core/ddim.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/fluid/operators/eigen/eigen_function.h" diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index c3ff91e7b15cd..def54e24840e7 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -130,14 +130,14 @@ struct DimensionsTransform { public: explicit DimensionsTransform(const std::vector &ins, - const paddle::framework::DDim &dims, + const pten::framework::DDim &dims, int axis) { const int N = ins.size(); dim_size = dims.size(); - out_dims = paddle::framework::vectorize(dims); + out_dims = pten::framework::vectorize(dims); in_dims.resize(N); for (int j = 0; j < N; ++j) { - in_dims[j] = paddle::framework::vectorize(ins[j]->dims()); + in_dims[j] = pten::framework::vectorize(ins[j]->dims()); } InputDimensionsExtend(N, axis); @@ -214,11 +214,11 @@ template __device__ void ElementwiseBroadcastKernelImpl( - const paddle::framework::Array &ins, - 
paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, - const paddle::framework::Array &use_broadcast, + const pten::framework::Array &ins, + pten::framework::Array<_ptr_ OutT *, NumOuts> outs, + const pten::framework::Array &use_broadcast, uint32_t numel, - const paddle::framework::Array, Arity> + const pten::framework::Array, Arity> &configs, int num, int block_offset, @@ -259,12 +259,11 @@ template __global__ void ElementwiseBroadcastKernel( - paddle::framework::Array ins, - paddle::framework::Array<_ptr_ OutT *, NumOuts> outs, - paddle::framework::Array use_broadcast, + pten::framework::Array ins, + pten::framework::Array<_ptr_ OutT *, NumOuts> outs, + pten::framework::Array use_broadcast, uint32_t numel, - paddle::framework::Array, Arity> - configs, + pten::framework::Array, Arity> configs, int main_offset, int tail_tid, Functor func) { @@ -345,10 +344,10 @@ void LaunchKernel(const KPDevice &ctx, Functor func, DimensionsTransform merge_dims) { int numel = (*outs)[0]->numel(); - paddle::framework::Array, Arity> configs; - paddle::framework::Array use_broadcast; - paddle::framework::Array ins_data; - paddle::framework::Array<_ptr_ OutT *, NumOuts> outs_data; + pten::framework::Array, Arity> configs; + pten::framework::Array use_broadcast; + pten::framework::Array ins_data; + pten::framework::Array<_ptr_ OutT *, NumOuts> outs_data; for (int i = 0; i < NumOuts; ++i) { outs_data[i] = (*outs)[i]->mutable_data(); @@ -444,7 +443,7 @@ void LaunchBroadcastKernelForDifferentVecSize( "The maximum dimension of input tensor is expected to be less than " "%d, but recieved %d.\n", merge_dims.dim_size, - paddle::framework::DDim::kMaxRank)); + pten::framework::DDim::kMaxRank)); } } #undef CALL_BROADCAST_FOR_DIM_SIZE @@ -1826,8 +1825,8 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx, } VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << paddle::framework::make_ddim(x_dims_array) - << " ydim:" << paddle::framework::make_ddim(y_dims_array); + << 
pten::framework::make_ddim(x_dims_array) + << " ydim:" << pten::framework::make_ddim(y_dims_array); CommonGradBroadcastCUDA(x, y, diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index e7d1d2d5f44fc..e247f786cc68d 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -32,7 +32,6 @@ namespace cub = hipcub; #endif -#include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" @@ -41,6 +40,7 @@ namespace cub = hipcub; #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/core/array.h" #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" @@ -118,7 +118,7 @@ static inline void CheckReduceRank(int reduce_rank, int rank) { // convert dims from vector to array template -static inline paddle::framework::Array VectorToArray( +static inline pten::framework::Array VectorToArray( const VectorLikeType& vec) { PADDLE_ENFORCE_LE(vec.size(), ElementCount, @@ -128,7 +128,7 @@ static inline paddle::framework::Array VectorToArray( vec.size(), ElementCount)); size_t n = static_cast(vec.size()); - paddle::framework::Array ret; + pten::framework::Array ret; for (size_t i = 0; i < n; ++i) { ret[i] = vec[i]; } @@ -162,7 +162,7 @@ static inline std::vector GetReduceDim(const std::vector& dims, } // namespace details -constexpr int kMaxRank = paddle::framework::DDim::kMaxRank; +constexpr int kMaxRank = pten::framework::DDim::kMaxRank; enum ReduceType { kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; @@ -202,9 +202,9 @@ struct IndexCalculator { } int dim; - paddle::framework::Array dims; - paddle::framework::Array strides; - paddle::framework::Array divmoders; + pten::framework::Array dims; + 
pten::framework::Array strides; + pten::framework::Array divmoders; }; template @@ -326,7 +326,7 @@ struct ReduceConfig { const paddle::platform::Place& place, pten::DenseTensor* tmp) { if (should_reduce_again) { - tmp->ResizeAndAllocate(paddle::framework::make_ddim( + tmp->ResizeAndAllocate(pten::framework::make_ddim( {static_cast(left_num * grid.z * grid.y * sizeof(Ty))})); output_data = tmp->mutable_data(); } else { @@ -1029,7 +1029,7 @@ static pten::DenseTensor tmp = pten::DenseTensor( pten::make_intrusive(place), pten::DenseTensorMeta(pten::DataType::UINT8, - paddle::framework::make_ddim( + pten::framework::make_ddim( {static_cast(temp_storage_bytes)}))); auto* temp_storage = tmp.mutable_data(); @@ -1073,7 +1073,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x, // Allocate memory y->mutable_data(); - auto x_dim = paddle::framework::vectorize(x.dims()); + auto x_dim = pten::framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(); int numel = x.numel(); diff --git a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h index 39cdbad5146de..557f6fae7b7f9 100644 --- a/paddle/pten/kernels/impl/dot_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/dot_grad_kernel_impl.h @@ -103,7 +103,7 @@ struct DotGradFunctionmutable_data(); const auto* data_y = tensor_y->data(); const DDim& dim = tensor_x->dims(); - size_t N = static_cast(paddle::framework::product(dim)); + size_t N = static_cast(pten::framework::product(dim)); auto step = dim[dim.size() - 1]; @@ -118,7 +118,7 @@ struct DotGradFunctionmutable_data(); const auto* data_x = tensor_x->data(); const DDim& dim = tensor_y->dims(); - size_t N = static_cast(paddle::framework::product(dim)); + size_t N = static_cast(pten::framework::product(dim)); auto step = dim[dim.size() - 1]; diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h index 134a815799de6..2900e2e83bd65 100644 
--- a/paddle/pten/kernels/impl/full_kernel_impl.h +++ b/paddle/pten/kernels/impl/full_kernel_impl.h @@ -36,7 +36,7 @@ void FullKernel(const Context& dev_ctx, const ScalarArray& shape, const Scalar& val, DenseTensor* out) { - out->ResizeAndAllocate(paddle::framework::make_ddim(shape.GetData())); + out->ResizeAndAllocate(pten::framework::make_ddim(shape.GetData())); FullValue(dev_ctx, out, val.to()); } diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h index b1bae78ddc5fa..71fadfae7deb8 100644 --- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h @@ -135,7 +135,7 @@ static DDim RowMatrixFromVector(const DDim& x_dim) { if (x_dim.size() > 1) { return x_dim; } - return paddle::framework::make_ddim({1, x_dim[0]}); + return pten::framework::make_ddim({1, x_dim[0]}); } /** @@ -146,7 +146,7 @@ static DDim ColumnMatrixFromVector(const DDim& y_dim) { if (y_dim.size() > 1) { return y_dim; } - return paddle::framework::make_ddim({y_dim[0], 1}); + return pten::framework::make_ddim({y_dim[0], 1}); } /** diff --git a/paddle/pten/kernels/impl/matmul_kernel_impl.h b/paddle/pten/kernels/impl/matmul_kernel_impl.h index 5ea9729655ecc..afe6bf71e2f6b 100644 --- a/paddle/pten/kernels/impl/matmul_kernel_impl.h +++ b/paddle/pten/kernels/impl/matmul_kernel_impl.h @@ -164,7 +164,7 @@ void MatMulFunction(const Context& dev_ctx, std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); out_dims.back() = y_dims.back(); } - Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims)); + Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); Out->mutable_data(); if (trans_y) { const int M = Y.numel() / N; @@ -242,7 +242,7 @@ void MatMulFunction(const Context& dev_ctx, } else { std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); } - Out->ResizeAndAllocate(paddle::framework::make_ddim(out_dims)); + 
Out->ResizeAndAllocate(pten::framework::make_ddim(out_dims)); Out->mutable_data(); if (trans_x) { @@ -330,7 +330,7 @@ void MatMulFunction(const Context& dev_ctx, out_broadcast_dims[ndim - 2] = M; out_broadcast_dims[ndim - 1] = N; - Out->ResizeAndAllocate(paddle::framework::make_ddim(out_broadcast_dims)); + Out->ResizeAndAllocate(pten::framework::make_ddim(out_broadcast_dims)); Out->mutable_data(); const int batch_dim = ndim - 2; @@ -493,12 +493,12 @@ void MatmulKernel(const Context& dev_ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE(paddle::framework::product(x.dims()), + PADDLE_ENFORCE_NE(pten::framework::product(x.dims()), 0, paddle::platform::errors::InvalidArgument( "The Input(X) dims size must not be equal 0," " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE(paddle::framework::product(y.dims()), + PADDLE_ENFORCE_NE(pten::framework::product(y.dims()), 0, paddle::platform::errors::InvalidArgument( "The Input(Y) dims size must not be equal 0," diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc index 6608d1ed08cab..0a3b56e3f18d4 100644 --- a/paddle/pten/tests/api/test_cast_api.cc +++ b/paddle/pten/tests/api/test_cast_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, cast) { diff --git a/paddle/pten/tests/api/test_conj_api.cc b/paddle/pten/tests/api/test_conj_api.cc index 50d190257a16d..c17b0f23f4f6b 100644 --- a/paddle/pten/tests/api/test_conj_api.cc +++ b/paddle/pten/tests/api/test_conj_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, conj) { diff --git 
a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc index 40e709b960334..97616d0cbcd57 100644 --- a/paddle/pten/tests/api/test_dot_api.cc +++ b/paddle/pten/tests/api/test_dot_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, dot) { diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc index 69af32eb457a6..17a6ffde9df0a 100644 --- a/paddle/pten/tests/api/test_elementwise_api.cc +++ b/paddle/pten/tests/api/test_elementwise_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, add) { diff --git a/paddle/pten/tests/api/test_empty_api.cc b/paddle/pten/tests/api/test_empty_api.cc index f4e3f472c7990..f38e91b02b705 100644 --- a/paddle/pten/tests/api/test_empty_api.cc +++ b/paddle/pten/tests/api/test_empty_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, empty_like) { diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc index 0d823765680e8..7910cc840f5ef 100644 --- a/paddle/pten/tests/api/test_fill_api.cc +++ b/paddle/pten/tests/api/test_fill_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, full_like) { diff 
--git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc index 6c082b9653e6f..cf8fa9cb1895f 100644 --- a/paddle/pten/tests/api/test_flatten_api.cc +++ b/paddle/pten/tests/api/test_flatten_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, flatten) { diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index 03f686f1c3f5e..08e0e888b99ed 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -26,7 +26,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(API, matmul_cpu) { // 1. create tensor diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc index 9d90e58101cbd..a7b85cff12cc1 100644 --- a/paddle/pten/tests/api/test_mean_api.cc +++ b/paddle/pten/tests/api/test_mean_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, mean) { diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index 59e9e9fab1122..bfd1ea841443f 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, reshape) { diff --git a/paddle/pten/tests/api/test_scale_api.cc 
b/paddle/pten/tests/api/test_scale_api.cc index 5ad52142765ba..bb5523d26c4e1 100644 --- a/paddle/pten/tests/api/test_scale_api.cc +++ b/paddle/pten/tests/api/test_scale_api.cc @@ -24,7 +24,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; void CheckScaleResult(experimental::Tensor* out) { ASSERT_EQ(out->dims().size(), 2); diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc index 5a7c9840e1114..c0d5a89eeb744 100644 --- a/paddle/pten/tests/api/test_sum_api.cc +++ b/paddle/pten/tests/api/test_sum_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, sum) { diff --git a/paddle/pten/tests/api/test_to_api.cc b/paddle/pten/tests/api/test_to_api.cc index 9aef716029a69..fa999aace6678 100644 --- a/paddle/pten/tests/api/test_to_api.cc +++ b/paddle/pten/tests/api/test_to_api.cc @@ -25,7 +25,7 @@ namespace paddle { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; paddle::experimental::Tensor CreateInputTensor() { const auto alloc = std::make_unique( diff --git a/paddle/pten/tests/kernels/test_cast_dev_api.cc b/paddle/pten/tests/kernels/test_cast_dev_api.cc index 80328d0b243e8..3b1412a8e5f4e 100644 --- a/paddle/pten/tests/kernels/test_cast_dev_api.cc +++ b/paddle/pten/tests/kernels/test_cast_dev_api.cc @@ -28,7 +28,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, cast) { // 1. 
create tensor diff --git a/paddle/pten/tests/kernels/test_conj_dev_api.cc b/paddle/pten/tests/kernels/test_conj_dev_api.cc index 6f2ea0602b81d..51066d8ae4783 100644 --- a/paddle/pten/tests/kernels/test_conj_dev_api.cc +++ b/paddle/pten/tests/kernels/test_conj_dev_api.cc @@ -26,7 +26,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, conj) { // 1. create tensor diff --git a/paddle/pten/tests/kernels/test_copy_dev_api.cc b/paddle/pten/tests/kernels/test_copy_dev_api.cc index d690b29d71f6f..4f8bd727716ce 100644 --- a/paddle/pten/tests/kernels/test_copy_dev_api.cc +++ b/paddle/pten/tests/kernels/test_copy_dev_api.cc @@ -26,7 +26,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(YuanRisheng): This TEST file need to be refactored after 'copy' realized // in 'paddle/api' diff --git a/paddle/pten/tests/kernels/test_creation_dev_api.cc b/paddle/pten/tests/kernels/test_creation_dev_api.cc index b1c23d4a768e6..1aa21b847fac4 100644 --- a/paddle/pten/tests/kernels/test_creation_dev_api.cc +++ b/paddle/pten/tests/kernels/test_creation_dev_api.cc @@ -27,7 +27,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, empty) { // 1. create input diff --git a/paddle/pten/tests/kernels/test_dot_dev_api.cc b/paddle/pten/tests/kernels/test_dot_dev_api.cc index 4213240f57ba8..e4978d84c835c 100644 --- a/paddle/pten/tests/kernels/test_dot_dev_api.cc +++ b/paddle/pten/tests/kernels/test_dot_dev_api.cc @@ -26,7 +26,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, dot) { // 1. 
create tensor diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index 23583a843561b..0bc16371c0731 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -26,7 +26,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, add) { // 1. create tensor diff --git a/paddle/pten/tests/kernels/test_flatten_dev_api.cc b/paddle/pten/tests/kernels/test_flatten_dev_api.cc index 13fc327b66945..78cd6261c3a41 100644 --- a/paddle/pten/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/pten/tests/kernels/test_flatten_dev_api.cc @@ -36,7 +36,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, flatten) { // 1. create tensor diff --git a/paddle/pten/tests/kernels/test_matmul_dev_api.cc b/paddle/pten/tests/kernels/test_matmul_dev_api.cc index 118215db505d5..76f7750319210 100644 --- a/paddle/pten/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/pten/tests/kernels/test_matmul_dev_api.cc @@ -25,7 +25,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, dot) { // 1. create tensor diff --git a/paddle/pten/tests/kernels/test_mean_dev_api.cc b/paddle/pten/tests/kernels/test_mean_dev_api.cc index a8860540fd0c9..07ec30afad5ca 100644 --- a/paddle/pten/tests/kernels/test_mean_dev_api.cc +++ b/paddle/pten/tests/kernels/test_mean_dev_api.cc @@ -25,7 +25,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, mean) { // 1. 
create tensor diff --git a/paddle/pten/tests/kernels/test_reshape_dev_api.cc b/paddle/pten/tests/kernels/test_reshape_dev_api.cc index 52038593d7012..dc90043305ca0 100644 --- a/paddle/pten/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/pten/tests/kernels/test_reshape_dev_api.cc @@ -25,7 +25,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(DEV_API, reshape) { diff --git a/paddle/pten/tests/kernels/test_scale_dev_api.cc b/paddle/pten/tests/kernels/test_scale_dev_api.cc index 1c0be6c06aacd..106835a204c65 100644 --- a/paddle/pten/tests/kernels/test_scale_dev_api.cc +++ b/paddle/pten/tests/kernels/test_scale_dev_api.cc @@ -25,7 +25,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, scale) { // 1. create tensor diff --git a/paddle/pten/tests/kernels/test_sum_dev_api.cc b/paddle/pten/tests/kernels/test_sum_dev_api.cc index 2b11ba9595c53..41d694a025f42 100644 --- a/paddle/pten/tests/kernels/test_sum_dev_api.cc +++ b/paddle/pten/tests/kernels/test_sum_dev_api.cc @@ -25,7 +25,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, sum) { // 1. 
create tensor From 4f1fef60b175d5d5b19a2f2cdc8487888f8a1b9a Mon Sep 17 00:00:00 2001 From: TTerror Date: Fri, 21 Jan 2022 15:18:01 +0800 Subject: [PATCH 07/15] refactor unittest for kunlun (#38772) * refactor unittests for kunlun * refactor unittests for kunlun, test=kunlun --- .../fluid/platform/device/xpu/xpu_op_list.cc | 26 ++ .../fluid/platform/device/xpu/xpu_op_list.h | 8 + paddle/fluid/pybind/pybind.cc | 8 + paddle/scripts/paddle_build.sh | 3 + .../unittests/xpu/get_test_cover_info.py | 242 ++++++++++++++ .../unittests/xpu/test_refactor_op_xpu.py | 297 ++++++++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.cc b/paddle/fluid/platform/device/xpu/xpu_op_list.cc index 448559a9edfee..36be4a55d0a6f 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.cc @@ -74,6 +74,32 @@ bool is_in_xpu_black_list(const std::string& op_name) { return false; } +std::vector get_xpu_op_support_type(const std::string& op_name, + XPUVersion version) { + std::vector res; + auto& ops = version == XPU1 ? get_kl1_ops() : get_kl2_ops(); + if (ops.find(op_name) != ops.end()) { + XPUKernelSet& type_set = ops[op_name]; + for (auto& item : type_set) { + res.push_back(item.data_type_); + } + } + return res; +} + +XPUOpListMap get_xpu_op_list(XPUVersion version) { + XPUOpListMap res; + auto& ops = version == XPU1 ? 
get_kl1_ops() : get_kl2_ops(); + for (auto& op : ops) { + std::vector op_vartypes; + for (auto& item : op.second) { + op_vartypes.push_back(item.data_type_); + } + res[op.first] = std::move(op_vartypes); + } + return res; +} + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/device/xpu/xpu_op_list.h b/paddle/fluid/platform/device/xpu/xpu_op_list.h index 705f701e13634..3672d68492a6f 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_list.h @@ -12,6 +12,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include +#include #include "paddle/fluid/framework/op_kernel_type.h" @@ -19,10 +20,17 @@ namespace paddle { namespace platform { using pOpKernelType = paddle::framework::OpKernelType; +using vartype = paddle::framework::proto::VarType; +using XPUOpListMap = + std::unordered_map>; bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); bool is_in_xpu_black_list(const std::string& op_name); +std::vector get_xpu_op_support_type(const std::string& op_name, + XPUVersion version); +XPUOpListMap get_xpu_op_list(XPUVersion version); + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 176db6b48c5ed..cd999f17f3a2f 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -129,6 +129,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" @@ -1762,6 +1763,13 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("get_xpu_device_count", platform::GetXPUDeviceCount); m.def("get_xpu_device_version", [](int device_id) { return platform::get_xpu_version(device_id); }); + m.def("get_xpu_device_op_support_types", + [](const std::string &op_name, platform::XPUVersion version) { + return platform::get_xpu_op_support_type(op_name, version); + }); + m.def("get_xpu_device_op_list", [](platform::XPUVersion version) { + return platform::get_xpu_op_list(version); + }); m.def("is_float16_supported", [](const platform::XPUPlace &place) -> bool { // XPUs with Compute Capability > xpu2 support float16 and bfloat16 return platform::get_xpu_version(place.device) > platform::XPUVersion::XPU1; diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index cf326a68e5948..384dfbf558f42 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1725,6 +1725,7 @@ function parallel_test_base_xpu() { EOF set +x + export XPU_OP_LIST_DIR=$tmp_dir ut_startTime_s=`date +%s` test_cases=$(ctest -N -V | grep "_xpu" ) # cases list which would be run exclusively get_quickly_disable_ut||disable_ut_quickly='disable_ut' # indicate whether the case was in quickly disable list @@ -1747,6 +1748,8 @@ set -x if [[ "$EXIT_CODE" != "0" ]]; then exit 8; fi + python ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py + unset XPU_OP_LIST_DIR fi } diff --git a/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py new file mode 100644 index 0000000000000..31246436efae2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/get_test_cover_info.py @@ -0,0 +1,242 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import inspect +import os +import fcntl + +import paddle +import paddle.fluid.core as core + +type_dict_paddle_to_str = { + paddle.bool: 'bool', + paddle.uint8: 'uint8', + paddle.int8: 'int8', + paddle.int16: 'int16', + paddle.int32: 'int32', + paddle.int64: 'int64', + paddle.float16: 'float16', + paddle.float32: 'float32', + paddle.float64: 'float64', + paddle.complex128: 'complex128', + paddle.complex64: 'complex64', +} + +type_dict_str_to_paddle = { + 'int32': paddle.int32, + 'int64': paddle.int64, + 'float32': paddle.float32, + 'float16': paddle.float16, + 'bool': paddle.bool, + 'uint8': paddle.uint8, + 'int8': paddle.int8, + 'complex128': paddle.complex128, + 'complex64': paddle.complex64, + 'int16': paddle.int16, +} + +xpu_test_op_white_list = [] +xpu_test_type_white_list = [] +xpu_test_op_type_white_list = [] +xpu_test_device_op_white_list = [] +xpu_test_device_op_type_white_list = [] + + +class XPUOpTestWrapper(object): + def create_classes(self): + base_class = None + classes = [] + return base_class, classes + + +def get_op_white_list(): + op_white_list = xpu_test_op_white_list + if os.getenv('XPU_TEST_OP_WHITE_LIST') is not None: + op_white_list.extend( + os.getenv('XPU_TEST_OP_WHITE_LIST').strip().split(',')) + return list(set(op_white_list)) + + +def get_type_white_list(): + type_white_list = xpu_test_type_white_list + if os.getenv('XPU_TEST_TYPE_WHITE_LIST') is not None: + type_white_list.extend( + os.getenv('XPU_TEST_TYPE_WHITE_LIST').strip().split(',')) + return list(set(type_white_list)) + + +def 
get_op_type_white_list(): + op_type_white_list = xpu_test_op_type_white_list + if os.getenv('XPU_TEST_OP_TYPE_WHITE_LIST') is not None: + op_type_white_list.extend( + os.getenv('XPU_TEST_OP_TYPE_WHITE_LIST').strip().split(',')) + return list(set(op_type_white_list)) + + +def get_device_op_white_list(): + device_op_white_list = xpu_test_device_op_white_list + if os.getenv('XPU_TEST_DEVICE_OP_WHITE_LIST') is not None: + device_op_white_list.extend( + os.getenv('XPU_TEST_DEVICE_OP_WHITE_LIST').strip().split(',')) + return list(set(device_op_white_list)) + + +def get_device_op_type_white_list(): + device_op_type_white_list = xpu_test_device_op_type_white_list + if os.getenv('XPU_TEST_DEVICE_OP_TYPE_WHITE_LIST') is not None: + device_op_type_white_list.extend( + os.getenv('XPU_TEST_DEVICE_OP_TYPE_WHITE_LIST').strip().split(',')) + return list(set(device_op_type_white_list)) + + +def make_xpu_op_list(xpu_version): + ops = [] + raw_op_list = core.get_xpu_device_op_list(xpu_version) + version_str = "xpu2" if xpu_version == core.XPUVersion.XPU2 else "xpu1" + op_white_list = get_op_white_list() + type_white_list = get_type_white_list() + op_type_white_list = get_op_type_white_list() + device_op_white_list = get_device_op_white_list() + device_op_type_white_list = get_device_op_type_white_list() + print('op_white_list:', op_white_list) + print('type_white_list:', type_white_list) + print('op_type_white_list:', op_type_white_list) + print('device_op_white_list:', device_op_white_list) + print('device_op_type_white_list:', device_op_type_white_list) + + for op_name, type_list in raw_op_list.items(): + device_op_name = version_str + '_' + op_name + if op_name in op_white_list or device_op_name in device_op_white_list: + continue + for op_type in type_list: + if op_type in type_white_list or op_type not in type_dict_paddle_to_str.keys( + ): + continue + + device_op_type_name = device_op_name + '_' + type_dict_paddle_to_str[ + op_type] + if device_op_type_name in 
device_op_type_white_list: + continue + + op_type_name = op_name + '_' + type_dict_paddle_to_str[op_type] + if op_type_name in op_type_white_list: + continue + + ops.append(op_type_name) + return ops + + +def get_xpu_op_support_types(op_name, dev_id=0): + xpu_version = core.get_xpu_device_version(dev_id) + support_type_list = core.get_xpu_device_op_support_types(op_name, + xpu_version) + support_type_str_list = [ + type_dict_paddle_to_str[x] for x in support_type_list + ] + return support_type_str_list + + +def record_op_test(op_name, test_type): + dirname = os.getenv('XPU_OP_LIST_DIR') + filename = 'xpu_op_test' + if dirname is not None: + filename = os.path.join(dirname, filename) + with open(filename, 'a') as f: + fcntl.flock(f, fcntl.LOCK_EX) + f.write(op_name + '_' + test_type + '\n') + + +def is_empty_grad_op_type(xpu_version, op, test_type): + xpu_op_list = core.get_xpu_device_op_list(xpu_version) + grad_op = op + '_grad' + if grad_op not in xpu_op_list.keys(): + return True + + grad_op_types = xpu_op_list[op] + paddle_test_type = type_dict_str_to_paddle[test_type] + if paddle_test_type not in grad_op_types: + return True + + return False + + +def create_test_class(func_globals, + test_class, + test_type, + test_grad=True, + ignore_deivce_version=[], + test_deivce_version=[]): + xpu_version = core.get_xpu_device_version(0) + if xpu_version in ignore_deivce_version: + return + + if len(test_deivce_version) != 0 and xpu_version not in test_deivce_version: + return + + test_class_obj = test_class() + register_classes = inspect.getmembers(test_class_obj, inspect.isclass) + op_name = test_class_obj.op_name + no_grad = is_empty_grad_op_type(xpu_version, op_name, test_type) + + for test_class in register_classes: + if test_class[0] == '__class__': + continue + class_obj = test_class[1] + cls_name = "{0}_{1}".format(test_class[0], str(test_type)) + func_globals[cls_name] = type(cls_name, (class_obj, ), + {'in_type': test_type}) + + if hasattr(test_class_obj, 
'use_dynamic_create_class' + ) and test_class_obj.use_dynamic_create_class: + base_class, dynamic_classes = test_class_obj.dynamic_create_class() + for dy_class in dynamic_classes: + cls_name = "{0}_{1}".format(dy_class[0], str(test_type)) + attr_dict = dy_class[1] + attr_dict['in_type'] = test_type + func_globals[cls_name] = type(cls_name, (base_class, ), attr_dict) + + record_op_test(op_name, test_type) + if not no_grad: + record_op_test(op_name + '_grad', test_type) + + +def get_test_cover_info(): + xpu_version = core.get_xpu_device_version(0) + version_str = "xpu2" if xpu_version == core.XPUVersion.XPU2 else "xpu1" + xpu_op_list = make_xpu_op_list(xpu_version) + xpu_op_covered = [] + + dirname = os.getenv('XPU_OP_LIST_DIR') + filename = 'xpu_op_test' + if dirname is not None: + filename = os.path.join(dirname, filename) + if os.path.exists(filename) and os.path.isfile(filename): + with open(filename) as f: + for line in f: + test_op_name = line.strip() + if test_op_name in xpu_op_list: + xpu_op_covered.append(test_op_name) + diff_list = list(set(xpu_op_list).difference(set(xpu_op_covered))) + total_len = len(set(xpu_op_list)) + covered_len = len(set(xpu_op_covered)) + print('{} test: {}/{}'.format(version_str, covered_len, total_len)) + if (len(diff_list) != 0): + print("These ops need to be tested on {0}! ops:{1}".format( + version_str, ','.join(diff_list))) + + +if __name__ == '__main__': + get_test_cover_info() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py new file mode 100644 index 0000000000000..cb54d12488d54 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py @@ -0,0 +1,297 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid import compiler, Program, program_guard + +import op_test +from op_test import OpTest, skip_check_grad_ci +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +def huber_loss_forward(val, delta): + abs_val = abs(val) + if abs_val <= delta: + return 0.5 * val * val + else: + return delta * (abs_val - 0.5 * delta) + + +# 1.动态生成不同参数的测试case,wrapper类中必须实现dynamic_create_class方法 +# self.use_dynamic_create_class置为True +class XPUTestArgsortOp1(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'argsort' + self.use_dynamic_create_class = True + + def dynamic_create_class(self): + base_class = self.TestArgsortOp + classes = [] + for descending in [True, False]: + for axis in [0, 1, 2, -1, -2]: + class_name = 'XPUTestArgsortOp_axis_' + str(axis) + attr_dict = {'init_axis': axis, 'descending': descending} + classes.append([class_name, attr_dict]) + return base_class, classes + + class TestArgsortOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "argsort" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + self.input_shape = (2, 2, 2, 3, 3) + self.axis = -1 + self.descending = False + + if self.in_type == 'float32': + self.x = np.random.random(self.input_shape).astype(self.dtype) + else: + self.x = np.random.randint( 
+ low=-1000, high=1000, + size=self.input_shape).astype(self.dtype) + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def get_output(self): + if self.descending: + self.indices = np.flip( + np.argsort( + self.x, kind='heapsort', axis=self.axis), + self.axis) + self.sorted_x = np.flip( + np.sort( + self.x, kind='heapsort', axis=self.axis), self.axis) + else: + self.indices = np.argsort( + self.x, kind='heapsort', axis=self.axis) + self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# 2. 为不同参数的测试case定义一个测试类,self.use_dynamic_create_class需要置为False +class XPUTestArgsortOp2(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'argsort' + self.use_dynamic_create_class = False + + class TestArgsortOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type = "argsort" + self.place = paddle.XPUPlace(0) + self.init_dtype() + self.init_inputshape() + self.init_axis() + self.init_direction() + + if self.in_type == 'float32': + self.x = np.random.random(self.input_shape).astype(self.dtype) + else: + self.x = np.random.randint( + low=-1000, high=1000, + size=self.input_shape).astype(self.dtype) + self.inputs = {"X": self.x} + self.attrs = {"axis": self.axis, "descending": self.descending} + self.get_output() + self.outputs = {"Out": self.sorted_x, "Indices": self.indices} + + def get_output(self): + if self.descending: + self.indices = np.flip( + np.argsort( + self.x, kind='heapsort', axis=self.axis), + self.axis) + self.sorted_x = np.flip( + np.sort( + self.x, kind='heapsort', axis=self.axis), self.axis) + else: + self.indices = np.argsort( + self.x, kind='heapsort', axis=self.axis) + self.sorted_x = np.sort(self.x, kind='heapsort', 
axis=self.axis) + + def set_xpu(self): + self.__class__.use_xpu = True + self.__class__.no_need_check_grad = True + + def init_inputshape(self): + self.input_shape = (2, 2, 2, 3, 3) + + def init_dtype(self): + self.dtype = self.in_type + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_direction(self): + self.descending = False + + class TestArgsortOpAxis0XPU(TestArgsortOp): + def init_axis(self): + self.axis = 0 + + class TestArgsortOpAxis1XPU(TestArgsortOp): + def init_axis(self): + self.axis = 1 + + class TestArgsortOpAxis2XPU(TestArgsortOp): + def init_axis(self): + self.axis = 2 + + class TestArgsortOpAxisNeg1XPU(TestArgsortOp): + def init_axis(self): + self.axis = -1 + + class TestArgsortOpAxisNeg2XPU(TestArgsortOp): + def init_axis(self): + self.axis = -2 + + class TestArgsortOpDescendingAxisXPU(TestArgsortOp): + def init_direction(self): + self.descending = True + + class TestArgsortOpDescendingAxis0XPU(TestArgsortOpAxis0XPU): + def init_direction(self): + self.descending = True + + class TestArgsortOpDescendingAxis1XPU(TestArgsortOpAxis1XPU): + def init_direction(self): + self.descending = True + + class TestArgsortOpDescendingAxis2XPU(TestArgsortOpAxis2XPU): + def init_direction(self): + self.descending = True + + class TestArgsortOpDescendingAxisNeg1XPU(TestArgsortOpAxisNeg1XPU): + def init_direction(self): + self.descending = True + + class TestArgsortOpDescendingAxisNeg2XPU(TestArgsortOpAxisNeg2XPU): + def init_direction(self): + self.descending = True + + +support_types = get_xpu_op_support_types('argsort') +for stype in support_types: + create_test_class(globals(), XPUTestArgsortOp1, stype) + create_test_class(globals(), XPUTestArgsortOp2, stype) + + +class XPUTestHuberLossOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'huber_loss' + self.use_dynamic_create_class = False + + class TestHuberLossOp(XPUOpTest): + def setUp(self): + self.set_xpu() + self.op_type 
= 'huber_loss' + self.place = paddle.XPUPlace(0) + + self.init_dtype() + + self.set_inputs() + self.set_attrs() + self.set_outputs() + + def set_inputs(self): + shape = self.set_shape() + x = np.random.uniform(0, 1., shape).astype(self.dtype) + y = np.random.uniform(0, 1., shape).astype(self.dtype) + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(x), + 'Y': OpTest.np_dtype_to_fluid_dtype(y) + } + + def set_attrs(self): + self.attrs = {'delta': 0.5} + + def set_outputs(self): + delta = self.attrs['delta'] + shape = self.set_shape() + residual = self.inputs['Y'] - self.inputs['X'] + loss = np.vectorize(huber_loss_forward)(residual, + delta).astype(self.dtype) + self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)} + + def set_shape(self): + return (100, 1) + + def set_xpu(self): + self.__class__.use_xpu = True + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("residual")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('residual')) + + class TestHuberLossOp1(TestHuberLossOp): + def set_shape(self): + return (640) + + class TestHuberLossOp2(TestHuberLossOp): + def set_shape(self): + return (10, 10) + + class TestHuberLossOp3(TestHuberLossOp): + def set_shape(self): + return (10, 10, 1) + + +support_types = get_xpu_op_support_types('huber_loss') +for stype in support_types: + create_test_class(globals(), XPUTestHuberLossOp, stype) + create_test_class( + globals(), + XPUTestHuberLossOp, + stype, + ignore_deivce_version=[core.XPUVersion.XPU1]) + +if __name__ == '__main__': + unittest.main() From fdab43b56692c93a5a732108cca66638796ed66f Mon Sep 17 00:00:00 2001 From: fwenguang 
<95677191+fwenguang@users.noreply.github.com> Date: Fri, 21 Jan 2022 15:25:22 +0800 Subject: [PATCH 08/15] [MLU]add mlu ci dockerfile (#39021) * [MLU]add mlu ci dockerfile * fix comment * add cncl --- paddle/fluid/framework/tensor_util.h | 2 +- paddle/fluid/memory/memcpy.cc | 17 ++++++ paddle/fluid/operators/mean_op_mlu.cc | 8 +-- paddle/fluid/operators/mlu/CMakeLists.txt | 2 +- tools/dockerfile/Dockerfile.mlu | 73 +++++++++++++++++++++++ 5 files changed, 94 insertions(+), 8 deletions(-) create mode 100644 tools/dockerfile/Dockerfile.mlu diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 3c62f3c5e43d7..f0c41e6dc0fcf 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -235,7 +235,7 @@ void TensorFromVector(const std::vector& src, } #endif #ifdef PADDLE_WITH_MLU - if (platform::is_mlu_place(dst_place)) { + else if (platform::is_mlu_place(dst_place)) { // NOLINT memory::Copy( dst_place, dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index f804c2af53916..6d348ceb87c83 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -962,6 +962,23 @@ void Copy(pten::Place dst_place, void* dst, stream); } +// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream. +template <> +void Copy(pten::CPUPlace dst_place, void* dst, + pten::Place src_place, const void* src, + size_t num, mluStream stream) { + Copy(pten::Place(dst_place.GetType()), dst, src_place, src, num, stream); +} + +// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream. +template <> +void Copy(pten::Place dst_place, void* dst, + pten::CPUPlace src_place, + const void* src, size_t num, + mluStream stream) { + Copy(dst_place, dst, pten::Place(src_place.GetType()), src, num, stream); +} + #endif // PADDLE_WITH_MLU // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. 
diff --git a/paddle/fluid/operators/mean_op_mlu.cc b/paddle/fluid/operators/mean_op_mlu.cc index 9862c2bd95256..ca4f3dcc3f465 100644 --- a/paddle/fluid/operators/mean_op_mlu.cc +++ b/paddle/fluid/operators/mean_op_mlu.cc @@ -35,9 +35,7 @@ class MeanMLUKernel : public framework::OpKernel { auto stream = context.template device_context().stream(); if (rank == 0) { // scalar - auto mlu_place = BOOST_GET(platform::MLUPlace, place); - memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), - stream); + memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream); return; } @@ -85,9 +83,7 @@ class MeanMLUGradKernel : public framework::OpKernel { auto stream = context.template device_context().stream(); if (rank == 0) { // scalar - auto mlu_place = BOOST_GET(platform::MLUPlace, place); - memory::Copy(mlu_place, out_data, mlu_place, in_data, numel * sizeof(T), - stream); + memory::Copy(place, out_data, place, in_data, numel * sizeof(T), stream); return; } diff --git a/paddle/fluid/operators/mlu/CMakeLists.txt b/paddle/fluid/operators/mlu/CMakeLists.txt index 3fc411d6d13fa..59fab48b271d5 100644 --- a/paddle/fluid/operators/mlu/CMakeLists.txt +++ b/paddle/fluid/operators/mlu/CMakeLists.txt @@ -1,5 +1,5 @@ IF(WITH_MLU) - cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib) + cc_library(mlu_baseop SRCS mlu_baseop.cc DEPS neuware_lib framework_proto xxhash) cc_test(activation_op_mlu_test SRCS activation_op_mlu_test.cc DEPS op_registry activation_op scope device_context executor) ENDIF() diff --git a/tools/dockerfile/Dockerfile.mlu b/tools/dockerfile/Dockerfile.mlu new file mode 100644 index 0000000000000..f7823738afc53 --- /dev/null +++ b/tools/dockerfile/Dockerfile.mlu @@ -0,0 +1,73 @@ +# A image for building paddle binaries +# Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions +# +# Build: +# - CNTOOLKIT_VERSION 2.6.5-1 +# - CNNL_VERSION 1.8.3-1 +# - CNCL_VERSION 1.0.2-1 +# +# Download three packages from FTP 
(need to connect cambricon AE to get FTP url) +# - cntoolkit_2.6.5-1.ubuntu18.04_amd64.deb +# - cnnl_1.8.3-1.ubuntu18.04_amd64.deb +# - cncl_1.0.2-1.ubuntu18.04_amd64.deb +# copy them to current directory first, then run build commands +# +# For example: +# +# cd Paddle/tools/dockerfile +# +# (get cntoolkit pkg) +# (get cnnl pkg) +# (get cncl pkg) +# +# docker build -f Dockerfile.mlu \ +# --build-arg CNTOOLKIT_VERSION=2.6.5-1 \ +# --build-arg CNNL_VERSION=1.8.3-1 \ +# --build-arg CNCL_VERSION=1.0.2-1 \ +# -t paddlepaddle/paddle:latest-dev-mlu . +# +# without mlu device: +# docker run -it --network=host --pids-limit 409600 \ +# paddlepaddle/paddle:latest-dev-mlu /bin/bash +# +# with mlu device: +# docker run -it --network=host --pids-limit 409600 \ +# --device=/dev/cambricon_ctl --device=/dev/cambricon_dev0 \ +# paddlepaddle/paddle:latest-dev-mlu /bin/bash + +FROM registry.baidubce.com/paddlepaddle/paddle:latest-dev +MAINTAINER PaddlePaddle Authors + +ENV WITH_GPU=OFF + +ARG CNTOOLKIT_VERSION=2.6.5-1 +ARG CNNL_VERSION=1.8.3-1 +ARG CNCL_VERSION=1.0.2-1 +ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb +ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb +ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb + +# install cntoolkit +COPY $CNTOOLKIT_PKG ./ +RUN dpkg -i $CNTOOLKIT_PKG && \ + apt-get update && \ + apt-get install -y cnrt cnperf cnpapi cnlicense cngdb cndrv cndev cncodec cncc cnas cnbin cnstudio cnrtc cnpx && \ + rm -f $CNTOOLKIT_PKG + +ENV NEUWARE_HOME=/usr/local/neuware +ENV LD_LIBRARY_PATH=$NEUWARE_HOME/lib64:$LD_LIBRARY_PATH + +# install cnnl +COPY $CNNL_PKG ./ +RUN dpkg -i $CNNL_PKG && \ + rm -f $CNNL_PKG + +# install cncl +COPY $CNCL_PKG ./ +RUN dpkg -i $CNCL_PKG && \ + rm -f $CNCL_PKG + +# Clean +RUN apt-get clean -y + +EXPOSE 22 From b47fb7648c84721808fe7452be96d7a92b98c648 Mon Sep 17 00:00:00 2001 From: TeslaZhao Date: Fri, 21 Jan 2022 15:37:27 +0800 Subject: [PATCH 09/15] Keep strided_slice op behavior consistent with slice 
op when starts input is less than -rank (#39066) --- paddle/fluid/operators/strided_slice_op.h | 6 +----- .../fluid/tests/unittests/test_strided_slice_op.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h index eaef9496a92dc..47714ebb806e9 100644 --- a/paddle/fluid/operators/strided_slice_op.h +++ b/paddle/fluid/operators/strided_slice_op.h @@ -121,6 +121,7 @@ static void StridedSliceFunctor(int64_t* starts, int64_t* ends, // stride must not be zero if (starts[axis_index] < 0) { starts[axis_index] = starts[axis_index] + axis_size; + starts[axis_index] = std::max(starts[axis_index], 0); } if (ends[axis_index] < 0) { if (!(ends[axis_index] == -1 && @@ -139,11 +140,6 @@ static void StridedSliceFunctor(int64_t* starts, int64_t* ends, } } - if ((starts[axis_index] < 0) && (axis_size > 0)) { - starts[axis_index] += axis_size; - starts[axis_index] = std::max(starts[axis_index], 0); - } - if (strides[axis_index] < 0) { reverse_axis[axis_index] = 1; strides[axis_index] = -strides[axis_index]; diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py index 9d89c7cbe1397..e9be6b338fb86 100644 --- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py @@ -216,6 +216,16 @@ def initTestCase(self): self.infer_flags = [1, 1, 1, 1, 1] +class TestStrideSliceOp14(TestStrideSliceOp): + def initTestCase(self): + self.input = np.random.rand(4, 4, 4, 4) + self.axes = [1, 2, 3] + self.starts = [-5, 0, -7] + self.ends = [-1, 2, 4] + self.strides = [1, 1, 1] + self.infer_flags = [1, 1, 1] + + class TestStrideSliceOpBool(TestStrideSliceOp): def test_check_grad(self): pass From df5152551d933487c7e9f0edd47c7066f2c95f86 Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Fri, 21 Jan 2022 15:38:02 +0800 Subject: [PATCH 10/15] modify 
DivideFunctor to match ElementwiseSameDims template (#39041) --- paddle/fluid/operators/mean_op.cu | 3 +-- paddle/pten/kernels/gpu/math_kernel.cu | 15 --------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu index 5a0afb68d63f1..63b5b871aabb5 100644 --- a/paddle/fluid/operators/mean_op.cu +++ b/paddle/fluid/operators/mean_op.cu @@ -59,8 +59,7 @@ class MeanCUDAKernel : public framework::OpKernel { return; } - using MT = typename details::MPTypeTrait::Type; - using Div = kernel_primitives::DivideFunctor; + using Div = kernel_primitives::DivideFunctor; std::vector reduce_dims; reduce_dims.reserve(rank); for (decltype(rank) i = 0; i < rank; ++i) { diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 80931db56c3de..d7a16ac49b1c9 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -52,21 +52,6 @@ namespace pten { dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } -/** - * Util Functors - */ - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) - : n_inv(static_cast(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T x) const { return x * n_inv; } - - private: - T n_inv; -}; - /** * Kernels */ From 814e5ab4e837a1c1270f67c6ca491da68b281a11 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Fri, 21 Jan 2022 16:36:50 +0800 Subject: [PATCH 11/15] Renamed selected_rows.* -> selected_rows_utils.* (#39037) --- paddle/fluid/distributed/service/brpc_utils.h | 2 +- .../distributed/table/depends/large_scale_kv.h | 2 +- paddle/fluid/eager/legacy/tensor_helper.cc | 2 +- paddle/fluid/framework/CMakeLists.txt | 14 +++++++------- paddle/fluid/framework/data_transform.h | 2 +- paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 6 +++--- .../fluid/framework/details/broadcast_op_handle.h | 2 +- 
.../framework/details/fused_broadcast_op_handle.h | 2 +- paddle/fluid/framework/details/gather_op_handle.h | 2 +- paddle/fluid/framework/details/reduce_and_gather.h | 2 +- paddle/fluid/framework/details/reduce_op_handle.h | 2 +- paddle/fluid/framework/details/variable_visitor.cc | 2 +- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/pten_utils.cc | 2 +- paddle/fluid/framework/pten_utils_test.cc | 2 +- .../{selected_rows.cc => selected_rows_utils.cc} | 2 +- .../{selected_rows.h => selected_rows_utils.h} | 0 ...ed_rows_test.cc => selected_rows_utils_test.cc} | 2 +- paddle/fluid/framework/var_type.h | 2 +- paddle/fluid/framework/var_type_traits.cc | 2 +- paddle/fluid/framework/var_type_traits_test.cc | 2 +- paddle/fluid/framework/variable.h | 2 +- paddle/fluid/framework/variable_helper.cc | 2 +- paddle/fluid/imperative/CMakeLists.txt | 10 +++++----- paddle/fluid/imperative/all_reduce.cc | 2 +- paddle/fluid/imperative/gloo_context.h | 2 +- paddle/fluid/imperative/gradient_accumulator.cc | 2 +- paddle/fluid/imperative/tests/CMakeLists.txt | 2 +- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/clip_by_norm_op.h | 2 +- .../operators/fused/fused_embedding_seq_pool_op.h | 2 +- paddle/fluid/operators/lookup_table_dequant_op.h | 2 +- paddle/fluid/operators/lookup_table_op.h | 2 +- paddle/fluid/operators/lookup_table_v2_op.h | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 4 ++-- paddle/fluid/operators/math/matrix_bit_code.h | 2 +- .../fluid/operators/math/selected_rows_functor.h | 2 +- paddle/fluid/operators/nce_op.h | 2 +- paddle/fluid/operators/optimizers/sgd_op.h | 2 +- paddle/fluid/operators/save_op.h | 2 +- paddle/fluid/pybind/io.cc | 2 +- paddle/fluid/pybind/pybind.cc | 2 +- paddle/pten/api/lib/utils/CMakeLists.txt | 2 +- 44 files changed, 56 insertions(+), 56 deletions(-) rename paddle/fluid/framework/{selected_rows.cc => selected_rows_utils.cc} (99%) rename paddle/fluid/framework/{selected_rows.h => selected_rows_utils.h} (100%) 
rename paddle/fluid/framework/{selected_rows_test.cc => selected_rows_utils_test.cc} (99%) diff --git a/paddle/fluid/distributed/service/brpc_utils.h b/paddle/fluid/distributed/service/brpc_utils.h index 47de71d2087e9..556bbb1048e2c 100644 --- a/paddle/fluid/distributed/service/brpc_utils.h +++ b/paddle/fluid/distributed/service/brpc_utils.h @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/port.h" diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index ac11183d192ff..3b00f1d6ccc3a 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -33,7 +33,7 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/eager/legacy/tensor_helper.cc b/paddle/fluid/eager/legacy/tensor_helper.cc index 97cac5a340419..2ee2f9fefa9a3 100644 --- a/paddle/fluid/eager/legacy/tensor_helper.cc +++ b/paddle/fluid/eager/legacy/tensor_helper.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include 
"paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 83e5c1c17925e..e4fe35b9b5c5a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -110,7 +110,7 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto scope) +cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows_utils framework_proto scope) if (WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() @@ -164,7 +164,7 @@ cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform) cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor - framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) + framework_proto selected_rows_utils data_device_transform data_type_transform data_layout_transform) cc_library(attribute SRCS attribute.cc DEPS framework_proto boost enforce) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc @@ -382,8 +382,8 @@ cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) -cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) -cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) +cc_library(selected_rows_utils SRCS selected_rows_utils.cc DEPS tensor) +cc_test(selected_rows_utils_test SRCS selected_rows_utils_test.cc DEPS selected_rows_utils) cc_test(op_kernel_type_test SRCS 
op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) @@ -406,7 +406,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer) cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) cc_library(generator SRCS generator.cc DEPS enforce place) -cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows place pten var_type_traits pten_api_utils op_info) +cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info) # Get the current working branch execute_process( @@ -438,8 +438,8 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) -if(WITH_TESTING AND TEST selected_rows_test) - set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) +if(WITH_TESTING AND TEST selected_rows_utils_test) + set_tests_properties(selected_rows_utils_test PROPERTIES TIMEOUT 120) endif() cc_test(scope_guard_test SRCS scope_guard_test.cc) diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index f8b36b48c308e..385a5ff704f51 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index faff846cf2a60..5b6aedd2fe14b 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/transform.h" namespace paddle { diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 87f77ec2fff3a..66dfb81755f1c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -12,7 +12,7 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) -cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) +cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows_utils) if(WITH_PSCORE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") @@ -88,7 +88,7 @@ endif() cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) -cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) 
+cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows_utils reference_count_pass_helper) set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto multi_devices_helper @@ -114,7 +114,7 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context gather_op_handle) -cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows) +cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows_utils) cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 055c7e63863b3..8453da3c79066 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index e08a768f8ce07..6ba6df7011ade 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/lod_tensor.h" #include 
"paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h index 9cbd94cd6b877..575b7ca083d94 100644 --- a/paddle/fluid/framework/details/gather_op_handle.h +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 9ecb2d8dbdd1c..583c34494bca4 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" namespace paddle { namespace framework { namespace details { diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 4b9f289eaa787..e9c913b0c8255 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/device_context.h" namespace 
paddle { diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc index be1371542f530..4315b6b0fc245 100644 --- a/paddle/fluid/framework/details/variable_visitor.cc +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/framework/details/variable_visitor.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" namespace pten { class DenseTensor; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ad84dbc9be6d2..8e000ef9985bd 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -32,7 +32,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/memory/malloc.h" diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 4e33e641cf1fc..2fd5b87b7f3fd 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc index ab2d60a34303a..004345fa1e571 100644 --- a/paddle/fluid/framework/pten_utils_test.cc +++ b/paddle/fluid/framework/pten_utils_test.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/pten_utils.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows_utils.cc similarity index 99% rename from paddle/fluid/framework/selected_rows.cc rename to paddle/fluid/framework/selected_rows_utils.cc index 6cad0915be736..c33ee655c2a98 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows_utils.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows_utils.h similarity index 100% rename from paddle/fluid/framework/selected_rows.h rename to paddle/fluid/framework/selected_rows_utils.h diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_utils_test.cc similarity index 99% rename from paddle/fluid/framework/selected_rows_test.cc rename to paddle/fluid/framework/selected_rows_utils_test.cc index 3b0509e0344ef..7a9f86041d996 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_utils_test.cc @@ -13,7 +13,7 @@ limitations under the License. 
*/ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 2e35f9b845ac7..5747df57c4568 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/variable.h" diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index eb8a1e4cea9fb..401ccb03d78d6 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/macros.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 9a9b90cd81179..812a34112a465 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include 
"paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 2fa48150903ad..188b00d818de3 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/var_type_traits.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 37ec5d7bc83bd..34ab07def54c1 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 594b0d48a8aad..d0f8d39f927f6 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,8 +1,8 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) ELSE() -cc_library(prepared_operator SRCS 
prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten pten_utils) ENDIF() cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) @@ -13,7 +13,7 @@ cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradien cc_library(imperative_profiler SRCS profiler.cc DEPS flags) if(NOT WIN32) if(WITH_NCCL OR WITH_RCCL) - cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor) + cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows_utils tensor) cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits) if(WITH_NCCL) nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) @@ -43,9 +43,9 @@ if(WITH_GLOO) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows selected_rows_functor var_type_traits layer math_function npu_op_runner) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner) endif() add_subdirectory(tests) diff --git 
a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 78855cc5c9e2e..d1d6a0f5adf58 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -25,7 +25,7 @@ #endif #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index e7c9ba4cfddb6..f13bb859eee93 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -17,7 +17,7 @@ #include #include #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 2056b8622052b..092872247cca5 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -19,7 +19,7 @@ #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 32e982f1f15ca..88f8076885e2f 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -12,7 +12,7 @@ else() endif(WIN32) 
-cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows selected_rows_functor gradient_accumulator math_function) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 985f870ded4e7..f8a27da00ba2b 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -156,7 +156,7 @@ endif() cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lapack_function +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling segment_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index fb21d9fec90ca..adb2a2fcfa3a7 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 8713d58034241..4e4322947a857 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" diff --git a/paddle/fluid/operators/lookup_table_dequant_op.h b/paddle/fluid/operators/lookup_table_dequant_op.h index af99c6e98c5ad..70aad1d3238f2 100644 --- a/paddle/fluid/operators/lookup_table_dequant_op.h +++ b/paddle/fluid/operators/lookup_table_dequant_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/math/blas.h" diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 74e26626bd528..a89d5fb7cb6e5 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/math/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 4e8d96afa03c4..54564395c6d04 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/math/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index fcf988efcd34c..65bf595bcebb8 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -66,9 +66,9 @@ math_library(maxouting) math_library(pooling) if(WITH_MKLDNN) - math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler) + math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler) else() - math_library(selected_rows_functor DEPS selected_rows math_function blas) + math_library(selected_rows_functor DEPS selected_rows_utils math_function blas) endif() math_library(sequence2batch) diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 22e5256335c73..71d905214ab9f 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index a1eb69db7cfce..8ba7851d7b979 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 364a0f02e3ab7..55f684b66485b 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -22,7 +22,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index a1fb3debb48e6..9d98e745a01ae 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/platform/bfloat16.h" diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index e44a5c77bd841..5ed71a26c8aa3 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" namespace paddle { diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc index fc49f76305461..88a43f9428b22 100644 --- a/paddle/fluid/pybind/io.cc +++ b/paddle/fluid/pybind/io.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/pybind/io.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind_boost_headers.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cd999f17f3a2f..cdbfa11abec72 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -54,7 +54,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/save_load_util.h" #include "paddle/fluid/framework/scope_pool.h" -#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/framework/type_defs.h" diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt index a4db8c4b193b6..74ecb3cd65262 100644 --- a/paddle/pten/api/lib/utils/CMakeLists.txt +++ b/paddle/pten/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS -tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits) +tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits) From 06803c29a387e52756ddeffb8e97e25062e237aa Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 21 Jan 2022 16:37:55 +0800 Subject: [PATCH 12/15] [pten] add concat pten kernel (#38955) --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/lod_tensor.cc | 42 +- paddle/fluid/framework/lod_tensor.h | 14 - paddle/fluid/framework/lod_tensor_test.cc | 5 +- paddle/fluid/framework/operator.cc | 4 + paddle/fluid/imperative/prepared_operator.cc | 4 + .../fluid/operators/array_to_lod_tensor_op.cc | 3 +- paddle/fluid/operators/concat_op.cc | 15 +- paddle/fluid/operators/concat_op.h | 111 +--- paddle/fluid/operators/concat_op_xpu.cc | 6 +- .../fluid/operators/lod_tensor_to_array_op.cc | 3 +- .../fluid/operators/math/concat_and_split.cc | 81 +-- .../fluid/operators/math/concat_and_split.cu | 435 +------------ paddle/fluid/operators/merge_lod_tensor_op.cc | 4 +- .../fluid/operators/shrink_rnn_memory_op.cc | 4 +- paddle/fluid/operators/split_lod_tensor_op.cc | 3 +- paddle/fluid/pybind/pybind.cc | 4 +- paddle/pten/CMakeLists.txt | 2 +- paddle/pten/api/include/kernel_signature.h | 5 + 
paddle/pten/api/lib/utils/tensor_utils.cc | 9 +- paddle/pten/core/CMakeLists.txt | 1 + paddle/pten/core/kernel_context.h | 2 +- paddle/pten/core/lod_utils.cc | 59 ++ paddle/pten/core/lod_utils.h | 37 ++ paddle/pten/infermeta/multiary.cc | 41 +- paddle/pten/infermeta/multiary.h | 11 +- paddle/pten/kernels/CMakeLists.txt | 2 +- paddle/pten/kernels/concat_kernel.h | 43 ++ paddle/pten/kernels/cpu/concat_and_split.h | 138 +++++ paddle/pten/kernels/cpu/concat_kernel.cc | 125 ++++ paddle/pten/kernels/funcs/concat_funcs.h | 95 +++ paddle/pten/kernels/gpu/concat_and_split.h | 569 ++++++++++++++++++ paddle/pten/kernels/gpu/concat_kernel.cu | 125 ++++ paddle/pten/tests/api/CMakeLists.txt | 1 + paddle/pten/tests/api/test_concat_api.cc | 86 +++ paddle/pten/tests/kernels/CMakeLists.txt | 1 + .../pten/tests/kernels/test_concat_dev_api.cc | 82 +++ python/paddle/utils/code_gen/api.yaml | 10 + python/paddle/utils/code_gen/api_gen.py | 39 +- 39 files changed, 1552 insertions(+), 671 deletions(-) create mode 100644 paddle/pten/core/lod_utils.cc create mode 100644 paddle/pten/core/lod_utils.h create mode 100644 paddle/pten/kernels/concat_kernel.h create mode 100644 paddle/pten/kernels/cpu/concat_and_split.h create mode 100644 paddle/pten/kernels/cpu/concat_kernel.cc create mode 100644 paddle/pten/kernels/funcs/concat_funcs.h create mode 100644 paddle/pten/kernels/gpu/concat_and_split.h create mode 100644 paddle/pten/kernels/gpu/concat_kernel.cu create mode 100644 paddle/pten/tests/api/test_concat_api.cc create mode 100644 paddle/pten/tests/kernels/test_concat_dev_api.cc diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e4fe35b9b5c5a..286a8684127a9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -94,7 +94,7 @@ else() endif() cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim mixed_vector place tensor framework_proto version) -cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor 
memory) +cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_utils lod_tensor memory) if(WITH_GPU) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index a4b9fff8ecd15..ab2e30a15ea15 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -117,7 +117,8 @@ bool CheckLoD(const LoD &in, int tensor_height) { } // check: the lowest level's last offset should equals `tensor_height` if // tensor_height>0. - if (tensor_height > 0 && (size_t)tensor_height != in.back().back()) + if (tensor_height > 0 && + static_cast(tensor_height) != in.back().back()) return false; // check: the higher level's last offset should equals the lower level's @@ -150,7 +151,7 @@ bool CheckAbsLoD(const LoD &in, int tensor_height) { if (level.front() != 0) return false; if (tensor_height < 0) { tensor_height = level.back(); - } else if ((size_t)tensor_height != level.back()) { + } else if (static_cast(tensor_height) != level.back()) { return false; } } @@ -186,27 +187,6 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, return LoDAndOffset{sub_lod, {start_idx, end_idx}}; } -void AppendLoD(LoD *lod, const LoD &lod_length) { - PADDLE_ENFORCE( - lod->empty() || lod->size() == lod_length.size(), - platform::errors::InvalidArgument( - "The input LoD length should be equal to the appended LoD size, but " - "received input LoD length is %d, actual LoD size is %d.", - lod_length, lod->size())); - if (lod->empty()) { - for (size_t i = 0; i < lod_length.size(); ++i) { - lod->emplace_back(1, 0); // size = 1, value = 0; - } - *lod = LoD(lod_length.size(), std::vector({0})); - } - for (size_t i = 0; i < lod->size(); ++i) { - auto &level = (*lod)[i]; - for (size_t len : lod_length[i]) { - level.push_back(level.back() + len); - } - } -} - void SerializeToStream(std::ostream &os, const LoDTensor &tensor, const platform::DeviceContext 
&dev_ctx) { { // the 1st field, uint32_t version for LoDTensor @@ -313,22 +293,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, TensorFromStream(is, static_cast(tensor), dev_ctx); } -LoD ConvertToLengthBasedLoD(const LoD &offset_lod) { - LoD length_lod; - length_lod.reserve(offset_lod.size()); - for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) { - std::vector level; - if (offset_lod[lvl].size() > 0) { - level.reserve(offset_lod[lvl].size() - 1); - } - for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) { - level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]); - } - length_lod.push_back(level); - } - return length_lod; -} - LoD ConvertToOffsetBasedLoD(const LoD &length_lod) { LoD offset_lod; offset_lod.reserve(length_lod.size()); diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 14727c190b581..63680c008bf66 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -157,8 +157,6 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, std::pair> GetSubLoDAndAbsoluteOffset( const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); -void AppendLoD(LoD* lod, const LoD& lod_length); - /* * Serialize/Desiralize LoDTensor to std::ostream * You can pass ofstream or ostringstream to serilize to file @@ -173,18 +171,6 @@ void DeserializeFromStream(std::istream& is, LoDTensor* tensor, const size_t& seek, const std::vector& shape); -/* - * Convert between length-based LoD and offset-based LoD. - * The implementation of LoDTensor class use offset-based LoD. - * However, we want to expose the more user-friendly length-based - * LoD to the Python side instead. 
- * - * Example: - * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]] - * then length_lod = [[2, 1], [3, 2, 4]] - */ -LoD ConvertToLengthBasedLoD(const LoD& offset_lod); - LoD ConvertToOffsetBasedLoD(const LoD& length_lod); void SerializeToStream(std::ostream& os, const LoDTensor& tensor); diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index 917bb7cc096c2..5e72c2d3d7e94 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/pten/core/lod_utils.h" namespace paddle { namespace framework { @@ -98,7 +99,7 @@ TEST(LoD, AppendLoD) { origin.push_back(std::vector({0, 1, 6})); origin.push_back(std::vector({0, 2, 5, 7, 10, 12, 15})); - paddle::framework::AppendLoD(&origin, lod_lens); + pten::AppendLoD(&origin, lod_lens); LoD expected; expected.push_back(std::vector({0, 2, 4})); @@ -277,7 +278,7 @@ TEST(LoD, ConvertToLengthBasedLoD) { offset_lod.push_back(std::vector({0, 1, 3})); offset_lod.push_back(std::vector({0, 2, 4, 5})); - LoD length_lod = ConvertToLengthBasedLoD(offset_lod); + LoD length_lod = pten::ConvertToLengthBasedLoD(offset_lod); LoD expected; expected.push_back(std::vector({2})); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e69a6c2e88c6b..33a4e5d2f3906 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1978,6 +1978,10 @@ void OperatorWithKernel::BuildPtenKernelContext( std::type_index(typeid(std::string))) { pt_kernel_context->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + pt_kernel_context->EmplaceBackAttr( + std::move(pten::Scalar(BOOST_GET_CONST(int, attr)))); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff 
--git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index d28595a6a4c75..fe60f05e1da43 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -438,6 +438,10 @@ static void BuildDygraphPtenKernelContext( std::type_index(typeid(std::string))) { kernel_ctx->EmplaceBackAttr( std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + kernel_ctx->EmplaceBackAttr( + std::move(pten::Scalar(BOOST_GET_CONST(int, attr)))); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to Scalar when construct " diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 1680ad528abf9..a959067ddba62 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/core/lod_utils.h" namespace paddle { namespace framework { @@ -168,7 +169,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { x[x_idx].lod(), idx, idx + 1, 0); auto &lod_length = lod_and_offset.first; - framework::AppendLoD(out_lod, lod_length); + pten::AppendLoD(out_lod, lod_length); size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index e6b1f6a1c18c3..9eba127a9b3ce 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -19,6 +19,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/pten/kernels/funcs/concat_funcs.h" + #ifdef PADDLE_WITH_MKLDNN #include #endif @@ -56,8 +58,8 @@ class ConcatOp : public framework::OperatorWithKernel { size_t axis = ComputeAxis(static_cast(ctx->Attrs().Get("axis")), static_cast(inputs_dims[0].size())); - framework::DDim out_dims = - ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis); + framework::DDim out_dims = pten::funcs::ComputeAndCheckShape( + ctx->IsRuntime(), inputs_dims, axis); if (out_dims[axis] < 0) { out_dims[axis] = -1; } @@ -102,6 +104,15 @@ class ConcatOp : public framework::OperatorWithKernel { return framework::OpKernelType(expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } + + framework::KernelSignature GetExpectedPtenKernelArgs( + const framework::ExecutionContext &ctx) const override { + if (ctx.HasInput("AxisTensor")) { + return framework::KernelSignature("concat", {"X"}, {"AxisTensor"}, + {"Out"}); + } + return framework::KernelSignature("concat", {"X"}, {"axis"}, {"Out"}); + } }; class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index bb72174be5ed5..3eaffbdc8bf35 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -22,54 +22,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/concat_kernel.h" +#include "paddle/pten/kernels/funcs/concat_funcs.h" + namespace paddle { namespace operators { -static inline framework::DDim ComputeAndCheckShape( - const bool is_runtime, const std::vector& inputs_dims, - const size_t axis) { - const size_t n = inputs_dims.size(); - auto out_dims = inputs_dims[0]; - size_t in_zero_dims_size = out_dims.size(); - for (size_t i = 1; i < n; i++) { - PADDLE_ENFORCE_EQ(inputs_dims[i].size(), out_dims.size(), - platform::errors::InvalidArgument( - "The shape of input[0] and input[%d] " - "is expected to be equal." - "But received input[0]'s shape = " - "[%s], input[%d]'s shape = [%s].", - i, inputs_dims[0], i, inputs_dims[i])); - for (size_t j = 0; j < in_zero_dims_size; j++) { - if (j == axis) { - if (is_runtime) { - out_dims[axis] += inputs_dims[i][j]; - } else { - if (inputs_dims[i][j] == -1 || out_dims[j] == -1) { - out_dims[axis] = -1; - } else { - out_dims[axis] += inputs_dims[i][j]; - } - } - } else { - bool check_shape = - is_runtime || (inputs_dims[0][j] > 0 && inputs_dims[i][j] > 0); - if (check_shape) { - // check all shape in run time - PADDLE_ENFORCE_EQ(inputs_dims[0][j], inputs_dims[i][j], - platform::errors::InvalidArgument( - "The %d-th dimension of input[0] and input[%d] " - "is expected to be equal." 
- "But received input[0]'s shape = " - "[%s], input[%d]'s shape = [%s].", - j, i, inputs_dims[0], i, inputs_dims[i])); - } - if (!is_runtime && out_dims[j] == -1 && inputs_dims[i][j] > 0) { - out_dims[j] = inputs_dims[i][j]; - } - } - } - } - return out_dims; -} static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { PADDLE_ENFORCE_EQ( @@ -109,67 +66,21 @@ class ConcatKernel : public framework::OpKernel { ins_dims[i] = ins[i]->dims(); } - framework::DDim out_dims = ComputeAndCheckShape(true, ins_dims, axis); + framework::DDim out_dims = + pten::funcs::ComputeAndCheckShape(true, ins_dims, axis); out->Resize(out_dims); } auto place = ctx.GetPlace(); out->mutable_data(place); - // If axis is 0, the lod of the output is not the same as inputs. - if (axis == 0 && ins[0]->lod().size() > 0) { - size_t lod_size_0 = ins[0]->lod().size(); - size_t lod_size = lod_size_0; - for (size_t i = 1; i < ins.size(); ++i) { - if (ins[i]->lod().size() > 0) { - PADDLE_ENFORCE_EQ( - ins[i]->lod().size(), lod_size_0, - platform::errors::Unimplemented( - "The lod level of all input LoDTensors should be same. " - "Maybe different lod level of input LoDTensors can concat," - "it is not supported currently. The lod level of %dth input " - "is %d and first input is %d.", - i, ins[i]->lod().size(), lod_size_0)); - } else { - lod_size = 0; - break; - } - } - if (lod_size) { - auto* out_lod = out->mutable_lod(); - for (size_t i = 1; i < ins.size(); ++i) { - auto in_lod = ConvertToLengthBasedLoD(ins[i]->lod()); - AppendLoD(out_lod, in_lod); - } - } + // call new kernel + auto& dev_ctx = ctx.device_context(); + std::vector pt_ins; + for (auto& in : ins) { + pt_ins.push_back(*in); } - // Sometimes direct copies will be faster, this maybe need deeply analysis. 
- if (axis == 0 && ins.size() < 10) { - size_t output_offset = 0; - for (auto* in : ins) { - if (!in || in->numel() == 0UL) { - continue; - } - auto in_stride = framework::stride_numel(in->dims()); - auto out_stride = framework::stride_numel(out->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, - out->data() + output_offset, out_stride, - in->data(), in_stride, in_stride[axis]); - output_offset += in_stride[axis]; - } - } else { - std::vector inputs; - for (size_t j = 0; j < ins.size(); ++j) { - if (ins[j] && ins[j]->numel() > 0) { - inputs.push_back(*ins[j]); - } else { - continue; - } - } - auto& dev_ctx = ctx.template device_context(); - paddle::operators::math::ConcatFunctor concat_functor; - concat_functor(dev_ctx, inputs, static_cast(axis), out); - } + pten::ConcatKernel(dev_ctx, pt_ins, axis, out); } }; diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index 0ff11e11165f0..aa10a58738bbd 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include #include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/pten/core/lod_utils.h" + namespace paddle { namespace operators { using Tensor = framework::Tensor; @@ -69,8 +71,8 @@ class ConcatXPUKernel : public framework::OpKernel { if (lod_size) { auto* out_lod = out->mutable_lod(); for (size_t i = 1; i < ins.size(); ++i) { - auto in_lod = ConvertToLengthBasedLoD(ins[i]->lod()); - AppendLoD(out_lod, in_lod); + auto in_lod = pten::ConvertToLengthBasedLoD(ins[i]->lod()); + pten::AppendLoD(out_lod, in_lod); } } } diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index e02972bd75353..5f39a9afa94ba 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/core/lod_utils.h" namespace paddle { namespace framework { @@ -134,7 +135,7 @@ class LoDTensorToArrayOp : public framework::OperatorBase { auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( x.lod(), start_idx, start_idx + 1, rank_level + 1); auto &lod_length = lod_and_offset.first; - framework::AppendLoD(&lod, lod_length); + pten::AppendLoD(&lod, lod_length); size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 4f12630d1e02f..a9f2680660bd2 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/concat_and_split.h" + +#include "paddle/pten/kernels/cpu/concat_and_split.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif @@ -44,36 +46,9 @@ class ConcatFunctor { void operator()(const platform::CPUDeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - // TODO(zcd): Add input data validity checking - size_t num = input.size(); - - int64_t rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int64_t out_rows = rows, out_cols = 0; - - std::vector input_cols(input.size()); - for (size_t i = 0; i < num; ++i) { - int64_t t_cols = input[i].numel() / rows; - out_cols += t_cols; - input_cols[i] = t_cols; - } - auto cpu_place = context.GetPlace(); - - // computation - auto output_data = output->data(); - int64_t col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int64_t col_len = input_cols[j]; - auto input_data = input[j].data(); - for (int64_t k = 0; k < out_rows; ++k) { - memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place, - input_data + k * col_len, sizeof(T) * col_len); - } - col_idx += col_len; - } + std::vector pt_input{input.begin(), input.end()}; + pten::ConcatImpl(context, pt_input, axis, + output); } }; @@ -88,46 +63,12 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, const int axis, std::vector* outputs) { - // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 - // tensors of shape [0,1,4] - if (input.numel() == 0) { - return; - } - - // TODO(zcd): Add input data validity checking - size_t num = outputs->size(); - - int input_rows = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - input_rows *= dim_0[i]; - } - - int input_cols = 0; - - std::vector output_cols(outputs->size()); - for (size_t i = 0; i < num; ++i) { - int t_cols = ref_inputs[i]->numel() / input_rows; - input_cols += 
t_cols; - output_cols[i] = t_cols; - } - auto cpu_place = context.GetPlace(); - - // computation - for (int k = 0; k < input_rows; ++k) { - const T* src_ptr = input.data() + k * input_cols; - int col_idx = 0; - for (size_t j = 0; j < num; ++j) { - int col_len = output_cols[j]; - auto* out_tensor = outputs->at(j); - if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->data() + k * col_len; - memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, - sizeof(T) * col_len); - } - col_idx += col_len; - } - } + std::vector pt_ref_inputs{ref_inputs.begin(), + ref_inputs.end()}; + std::vector pt_outputs{outputs->begin(), + outputs->end()}; + pten::SplitImpl( + context, input, pt_ref_inputs, axis, &pt_outputs); } }; diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 5b99a62d78d2a..4357a86b7e65d 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -12,218 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/gpu/concat_and_split.h" namespace paddle { namespace operators { namespace math { -template -__global__ void ConcatKernel(const T** inputs, const int64_t* input_cols, - int col_size, const int64_t output_rows, - const int64_t output_cols, T* output) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int curr_segment = 0; - int curr_offset = input_cols[0]; - for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - int curr_col_offset = input_cols[curr_segment + 1]; - while (curr_col_offset <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - curr_col_offset = input_cols[curr_segment + 1]; - } - - int local_col = tid_x - curr_offset; - int segment_width = curr_col_offset - curr_offset; - - const T* input_ptr = inputs[curr_segment]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) - output[tid_y * output_cols + tid_x] = - input_ptr[tid_y * segment_width + local_col]; - } -} - -template -__device__ void ConcatKernelDetail(const T** inputs_data, - const int fixed_in_col, const int out_rows, - const int out_cols, T* output_data) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * 1.0 / fixed_in_col; - int in_offset = tid_x - split * fixed_in_col; - const T* input_ptr = inputs_data[split]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) { - output_data[tid_y * out_cols + tid_x] = - input_ptr[tid_y * fixed_in_col + in_offset]; - } - } -} - 
-template -__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const int64_t fixed_in_col, const int64_t out_rows, - const int64_t out_cols, T* output_data) { - const T* inputs_data[2]; - inputs_data[0] = input_addr0; - inputs_data[1] = input_addr1; - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - -template -__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const T* input_addr2, const int64_t fixed_in_col, - const int64_t out_rows, const int64_t out_cols, - T* output_data) { - const T* inputs_data[3]; - inputs_data[0] = input_addr0; - inputs_data[1] = input_addr1; - inputs_data[2] = input_addr2; - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - -template -__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const T* input_addr2, const T* input_addr3, - const int64_t fixed_in_col, const int64_t out_rows, - const int64_t out_cols, T* output_data) { - const T* inputs_data[4]; - inputs_data[0] = input_addr0; - inputs_data[1] = input_addr1; - inputs_data[2] = input_addr2; - inputs_data[3] = input_addr3; - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - -template -__global__ void ConcatKernel(const T** inputs_data, const int in_num, - const int64_t fixed_in_col, const int64_t out_rows, - const int64_t out_cols, T* output_data) { - ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, - output_data); -} - -template -__global__ void SplitKernel(const T* input_data, const int64_t in_row, - const int64_t in_col, const int64_t* out_cols, - int out_cols_size, T** outputs_data) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int curr_segment = 0; - int curr_offset = out_cols[0]; - for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { - int curr_col_offset = out_cols[curr_segment + 1]; - while (curr_col_offset <= tid_x) { - curr_offset = curr_col_offset; - 
++curr_segment; - curr_col_offset = out_cols[curr_segment + 1]; - } - - int local_col = tid_x - curr_offset; - int segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs_data[curr_segment]; - if (output_ptr != nullptr) { - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input_data[tid_y * in_col + tid_x]; - } - } -} - -template -__device__ void SplitKernelDetail(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T** outputs_data) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x / fixed_out_col; - int in_offset = tid_x - split * fixed_out_col; - T* output_ptr = outputs_data[split]; - if (output_ptr != nullptr) { - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * fixed_out_col + in_offset] = - input_data[tid_y * in_col + tid_x]; - } - } -} - -template -__global__ void SplitKernel(const T* input_data, const int64_t in_row, - const int64_t in_col, const int64_t fixed_out_col, - T** outputs_data) { - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel(const T* input_data, const int64_t in_row, - const int64_t in_col, const int64_t fixed_out_col, - T* outputs_addr0, T* outputs_addr1) { - T* outputs_data[2]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel(const T* input_data, const int64_t in_row, - const int64_t in_col, const int64_t fixed_out_col, - T* outputs_addr0, T* outputs_addr1, - T* outputs_addr2) { - T* outputs_data[3]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - 
SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -template -__global__ void SplitKernel(const T* input_data, const int64_t in_row, - const int64_t in_col, const int64_t fixed_out_col, - T* outputs_addr0, T* outputs_addr1, - T* outputs_addr2, T* outputs_addr3) { - T* outputs_data[4]; - outputs_data[0] = outputs_addr0; - outputs_data[1] = outputs_addr1; - outputs_data[2] = outputs_addr2; - outputs_data[3] = outputs_addr3; - SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); -} - -static inline void GetBlockDims(const platform::CUDADeviceContext& context, - int64_t num_rows, int64_t num_cols, - dim3* block_dims, dim3* grid_dims) { - // Set the thread block and grid according to CurrentDeviceId - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (num_cols < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((num_cols + 31) >> 5) << 5; - } - int block_rows = kThreadsPerBlock / block_cols; - *block_dims = dim3(block_cols, block_rows, 1); - - int max_threads = context.GetMaxPhysicalThreadCount(); - int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - - int grid_cols = - std::min((num_cols + block_cols - 1) / block_cols, max_blocks); - int grid_rows = std::min(max_blocks / grid_cols, - std::max(num_rows / block_rows, (int64_t)1)); - *grid_dims = dim3(grid_cols, grid_rows, 1); -} - /* * All tensors' dimension should be the same and the values of * each dimension must be the same, except the axis dimension. 
@@ -234,112 +29,10 @@ class ConcatFunctor { void operator()(const platform::CUDADeviceContext& context, const std::vector& input, int axis, framework::Tensor* output) { - // TODO(zcd): Add input data validity checking - int in_num = input.size(); - int64_t in_row = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - in_row *= dim_0[i]; - } - int64_t in_col = input[0].numel() / in_row; - int64_t out_row = in_row, out_col = 0; - - int inputs_col_num = in_num + 1; - std::vector inputs_data_vec(in_num); - std::vector inputs_col_vec(inputs_col_num); - const T** inputs_data = inputs_data_vec.data(); - int64_t* inputs_col = inputs_col_vec.data(); - -// There are some differences between hip runtime and NV runtime. -// In NV, when the pageable memory data less than 64K is transferred from -// hosttodevice, it will be automatically asynchronous. -// However, only pinned memory in hip can copy asynchronously -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device -// 3.2.6.1. 
Concurrent Execution between Host and Device -// Memory copies from host to device of a memory block of 64 KB or less -#ifdef PADDLE_WITH_HIP - memory::AllocationPtr data_alloc, col_alloc; - data_alloc = - memory::Alloc(platform::CUDAPinnedPlace(), in_num * sizeof(T*)); - inputs_data = reinterpret_cast(data_alloc->ptr()); - col_alloc = memory::Alloc(platform::CUDAPinnedPlace(), - inputs_col_num * sizeof(int)); - inputs_col = reinterpret_cast(col_alloc->ptr()); -#endif - - inputs_col[0] = 0; - bool has_same_shape = true; - for (int i = 0; i < in_num; ++i) { - int64_t t_cols = input[i].numel() / in_row; - if (has_same_shape) { - if (t_cols != in_col) has_same_shape = false; - } - out_col += t_cols; - inputs_col[i + 1] = out_col; - inputs_data[i] = input[i].data(); - } - - dim3 block_dims; - dim3 grid_dims; - GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims); + std::vector pt_input{input.begin(), input.end()}; - memory::allocation::AllocationPtr tmp_dev_ins_data; - const T** dev_ins_data = nullptr; - if (!has_same_shape || in_num < 2 || in_num > 4) { - tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); - auto* restored = - platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num); - memory::Copy(context.GetPlace(), tmp_dev_ins_data->ptr(), - platform::CPUPlace(), restored, in_num * sizeof(T*), - context.stream()); - dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); - } - - if (has_same_shape) { - if (in_num == 2) { - ConcatKernel<<>>( - inputs_data[0], inputs_data[1], in_col, out_row, out_col, - output->data()); - } else if (in_num == 3) { - ConcatKernel<<>>( - inputs_data[0], inputs_data[1], inputs_data[2], in_col, out_row, - out_col, output->data()); - } else if (in_num == 4) { - ConcatKernel<<>>( - inputs_data[0], inputs_data[1], inputs_data[2], inputs_data[3], - in_col, out_row, out_col, output->data()); - } else { - ConcatKernel<<>>( - dev_ins_data, in_num, in_col, out_row, out_col, output->data()); - } - } else { - 
auto tmp_dev_ins_col_data = - memory::Alloc(context, inputs_col_num * sizeof(int64_t)); - - auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( - inputs_col, inputs_col_num); - memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), - platform::CPUPlace(), restored, - inputs_col_num * sizeof(int64_t), context.stream()); - int64_t* dev_ins_col_data = - static_cast(tmp_dev_ins_col_data->ptr()); - - ConcatKernel<<>>( - dev_ins_data, dev_ins_col_data, static_cast(inputs_col_num), - out_row, out_col, output->data()); - } - -#ifdef PADDLE_WITH_HIP - // Prevent the pinned memory value from being covered and release the memory - // after the launch kernel of the stream is executed (reapply pinned memory - // next time) - auto* data_alloc_released = data_alloc.release(); - auto* col_alloc_released = col_alloc.release(); - context.AddStreamCallback([data_alloc_released, col_alloc_released] { - memory::allocation::Allocator::AllocationDeleter(data_alloc_released); - memory::allocation::Allocator::AllocationDeleter(col_alloc_released); - }); -#endif + pten::ConcatImpl(context, pt_input, axis, + output); } }; @@ -355,120 +48,12 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, int axis, std::vector* outputs) { - // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 - // tensors of shape [0,1,4] - if (input.numel() == 0) { - return; - } - - // TODO(zcd): Add input data validity checking - int o_num = outputs->size(); - int64_t out_row = 1; - auto dim_0 = ref_inputs[0]->dims(); - for (int i = 0; i < axis; ++i) { - out_row *= dim_0[i]; - } - - int64_t out0_col = ref_inputs[0]->numel() / out_row; - int64_t in_col = 0, in_row = out_row; - bool has_same_shape = true; - - int outputs_cols_num = o_num + 1; - std::vector outputs_data_vec(o_num); - std::vector outputs_cols_vec(outputs_cols_num); - T** outputs_data = outputs_data_vec.data(); - int64_t* outputs_cols = outputs_cols_vec.data(); - -// There are some 
differences between hip runtime and NV runtime. -// In NV, when the pageable memory data less than 64K is transferred from -// hosttodevice, it will be automatically asynchronous. -// However, only pinned memory in hip can copy asynchronously -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device -// 3.2.6.1. Concurrent Execution between Host and Device -// Memory copies from host to device of a memory block of 64 KB or less -#ifdef PADDLE_WITH_HIP - memory::AllocationPtr data_alloc, cols_alloc; - data_alloc = memory::Alloc(platform::CUDAPinnedPlace(), o_num * sizeof(T*)); - outputs_data = reinterpret_cast(data_alloc->ptr()); - cols_alloc = memory::Alloc(platform::CUDAPinnedPlace(), - (outputs_cols_num) * sizeof(int64_t)); - outputs_cols = reinterpret_cast(cols_alloc->ptr()); -#endif - - outputs_cols[0] = 0; - for (int i = 0; i < o_num; ++i) { - int64_t t_col = ref_inputs.at(i)->numel() / out_row; - if (has_same_shape) { - if (t_col != out0_col) has_same_shape = false; - } - in_col += t_col; - outputs_cols[i + 1] = in_col; - if (outputs->at(i) != nullptr) { - outputs_data[i] = outputs->at(i)->data(); - } else { - outputs_data[i] = nullptr; - } - } - - dim3 block_dims; - dim3 grid_dims; - GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); - - memory::allocation::AllocationPtr tmp_dev_outs_data; - T** dev_out_gpu_data = nullptr; - if (!has_same_shape || o_num < 2 || o_num > 4) { - tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); - auto* restored = - platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num); - memory::Copy(context.GetPlace(), tmp_dev_outs_data->ptr(), - platform::CPUPlace(), restored, o_num * sizeof(T*), - context.stream()); - dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); - } - - if (has_same_shape) { - if (o_num == 2) { - SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, outputs_data[0], - outputs_data[1]); - } else if (o_num == 3) { - 
SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, outputs_data[0], - outputs_data[1], outputs_data[2]); - } else if (o_num == 4) { - SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, outputs_data[0], - outputs_data[1], outputs_data[2], outputs_data[3]); - } else { - SplitKernel<<>>( - input.data(), in_row, in_col, out0_col, dev_out_gpu_data); - } - } else { - auto tmp_dev_ins_col_data = - memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); - auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( - outputs_cols, outputs_cols_num); - memory::Copy(context.GetPlace(), tmp_dev_ins_col_data->ptr(), - platform::CPUPlace(), restored, - outputs_cols_num * sizeof(int64_t), context.stream()); - int64_t* dev_outs_col_data = - reinterpret_cast(tmp_dev_ins_col_data->ptr()); - - SplitKernel<<>>( - input.data(), in_row, in_col, dev_outs_col_data, - static_cast(outputs_cols_num), dev_out_gpu_data); - } -#ifdef PADDLE_WITH_HIP - // Prevent the pinned memory value from being covered and release the memory - // after the launch kernel of the stream is executed (reapply pinned memory - // next time) - auto* data_alloc_released = data_alloc.release(); - auto* cols_alloc_released = cols_alloc.release(); - context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - memory::allocation::Allocator::AllocationDeleter(data_alloc_released); - memory::allocation::Allocator::AllocationDeleter(cols_alloc_released); - }); -#endif + std::vector pt_ref_inputs{ref_inputs.begin(), + ref_inputs.end()}; + std::vector pt_outputs{outputs->begin(), + outputs->end()}; + pten::SplitImpl( + context, input, pt_ref_inputs, axis, &pt_outputs); } }; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 653283b604f07..5ebaefcf808c3 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/pten/core/lod_utils.h" + namespace pten { class DenseTensor; } // namespace pten @@ -122,7 +124,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { input->lod(), *in_idx, (*in_idx) + 1, 0); auto &lod_length = lod_and_offset.first; - framework::AppendLoD(out_lod, lod_length); + pten::AppendLoD(out_lod, lod_length); size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index f39a1c0a39d6e..493073fadc2bd 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/array_operator.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/core/lod_utils.h" + namespace paddle { namespace framework { class OpDesc; @@ -73,7 +75,7 @@ class ShrinkRNNMemoryOp : public ArrayOp { dst_num_rows, 0); height = lod_offset.second.second; auto out_lod = out_tensor.mutable_lod(); - framework::AppendLoD(out_lod, lod_offset.first); + pten::AppendLoD(out_lod, lod_offset.first); } if (dst_num_rows != 0) { diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index 9c22fa4797219..4cb2a292018f6 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/core/lod_utils.h" namespace pten { class DenseTensor; @@ -96,7 +97,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { x_lod, start_idx, start_idx + 1, level); auto &lod_length = lod_and_offset.first; - framework::AppendLoD(lod, lod_length); + pten::AppendLoD(lod, lod_length); size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index cdbfa11abec72..454e3b524f5f1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -43,7 +43,6 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/generate_pass.h" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/new_executor/standalone_executor.h" #include "paddle/fluid/framework/op_info.h" @@ -75,6 +74,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/pten/core/lod_utils.h" #ifndef PADDLE_ON_INFERENCE #include "paddle/fluid/pybind/eager.h" #endif @@ -1093,7 +1093,7 @@ PYBIND11_MODULE(core_noavx, m) { .def("recursive_sequence_lengths", [](framework::Tensor &self) -> std::vector> { // output the length-based lod info - LoD lod = ConvertToLengthBasedLoD(self.lod()); + LoD lod = pten::ConvertToLengthBasedLoD(self.lod()); std::vector> new_lod; new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt index 9b6e5d70cd899..cde5e719e316d 100644 --- a/paddle/pten/CMakeLists.txt +++ b/paddle/pten/CMakeLists.txt @@ -18,7 +18,7 @@ add_subdirectory(ops) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta) +set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils) get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) # keep this message for debug, remove it later if needless message(STATUS "All standard pten kernels: ${pten_kernels}") diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h index 0b17415a6a98d..e3929d59159c1 100644 --- a/paddle/pten/api/include/kernel_signature.h +++ b/paddle/pten/api/include/kernel_signature.h @@ -38,6 +38,11 @@ using cast_kernel = void (*)(const DeviceContext&, DataType, DenseTensor*); +using concat_kernel = void (*)(const DeviceContext&, + const std::vector&, + const Scalar&, + DenseTensor*); + using divide_kernel = void (*)(const DeviceContext&, const DenseTensor&, const DenseTensor&, diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc index 
1420810007d1c..2e94d508aec7d 100644 --- a/paddle/pten/api/lib/utils/tensor_utils.cc +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -38,6 +38,11 @@ std::unique_ptr MakePtenDenseTensorBase( src.dims(), src.layout(), src.offset()}; + if (!src.IsInitialized()) { + return std::make_unique( + std::move(pten::make_intrusive(src.place())), + std::move(meta)); + } auto shared_storage = pten::make_intrusive(src.Holder()); return std::make_unique(std::move(shared_storage), std::move(meta)); @@ -247,7 +252,9 @@ std::unique_ptr MakePtenTensorBaseFromVar( if (variable.IsType()) { const auto& tensor = variable.Get(); - if (!platform::is_same_place(tensor.place(), expected_place)) { + + if (tensor.IsInitialized() && + !platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); return MakePtenDenseTensor(tmp_tensor); diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt index eabc5a19babad..d89b3c9fefb59 100644 --- a/paddle/pten/core/CMakeLists.txt +++ b/paddle/pten/core/CMakeLists.txt @@ -12,6 +12,7 @@ cc_library(arg_map_context SRCS arg_map_context.cc DEPS enforce) cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce) cc_library(tensor_meta SRCS tensor_meta.cc DEPS enforce mixed_vector) +cc_library(lod_utils SRCS lod_utils.cc DEPS enforce mixed_vector) cc_library(dense_tensor SRCS dense_tensor.cc DEPS convert_utils tensor_meta tensor_base) cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base ) diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h index 5559b348aa1c9..5dd2bf367b3b8 100644 --- a/paddle/pten/core/kernel_context.h +++ b/paddle/pten/core/kernel_context.h @@ -92,7 +92,7 @@ class KernelContext { std::vector MoveInputsBetween(size_t start, size_t end) { std::vector v; for (size_t i = start; i < end; ++i) { - auto t = std::dynamic_pointer_cast(inputs_.at(i)); + auto t = 
static_cast(inputs_.at(i)); v.emplace_back(*t); inputs_.at(i) = nullptr; } diff --git a/paddle/pten/core/lod_utils.cc b/paddle/pten/core/lod_utils.cc new file mode 100644 index 0000000000000..ad5ea6d39d39c --- /dev/null +++ b/paddle/pten/core/lod_utils.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/core/lod_utils.h" + +#include "paddle/fluid/platform/enforce.h" + +namespace pten { + +void AppendLoD(LoD *lod, const LoD &lod_length) { + PADDLE_ENFORCE( + lod->empty() || lod->size() == lod_length.size(), + paddle::platform::errors::InvalidArgument( + "The input LoD length should be equal to the appended LoD size, but " + "received input LoD length is %d, actual LoD size is %d.", + lod_length.size(), + lod->size())); + if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(1, 0); // size = 1, value = 0; + } + *lod = LoD(lod_length.size(), std::vector({0})); + } + for (size_t i = 0; i < lod->size(); ++i) { + auto &level = (*lod)[i]; + for (size_t len : lod_length[i]) { + level.push_back(level.back() + len); + } + } +} + +LoD ConvertToLengthBasedLoD(const LoD &offset_lod) { + LoD length_lod; + length_lod.reserve(offset_lod.size()); + for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) { + std::vector level; + if (offset_lod[lvl].size() > 0) { + level.reserve(offset_lod[lvl].size() - 1); + } + for (size_t idx = 0; idx 
< offset_lod[lvl].size() - 1; ++idx) { + level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]); + } + length_lod.push_back(level); + } + return length_lod; +} + +} // namespace pten diff --git a/paddle/pten/core/lod_utils.h b/paddle/pten/core/lod_utils.h new file mode 100644 index 0000000000000..4c2547a43c027 --- /dev/null +++ b/paddle/pten/core/lod_utils.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/framework/mixed_vector.h" + +namespace pten { +using LoD = std::vector>; + +void AppendLoD(LoD* lod, const LoD& lod_length); + +/* + * Convert between length-based LoD and offset-based LoD. + * The implementation of LoDTensor class use offset-based LoD. + * However, we want to expose the more user-friendly length-based + * LoD to the Python side instead. + * + * Example: + * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]] + * then length_lod = [[2, 1], [3, 2, 4]] + */ +LoD ConvertToLengthBasedLoD(const LoD& offset_lod); + +} // namespace pten diff --git a/paddle/pten/infermeta/multiary.cc b/paddle/pten/infermeta/multiary.cc index 5dbf3d58a1952..ecd0396a28688 100644 --- a/paddle/pten/infermeta/multiary.cc +++ b/paddle/pten/infermeta/multiary.cc @@ -14,4 +14,43 @@ limitations under the License. 
*/ #include "paddle/pten/infermeta/multiary.h" -namespace pten {} // namespace pten +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/kernels/funcs/concat_funcs.h" +namespace pten { + +DenseTensorMeta ConcatInferMeta(const std::vector& x_meta, + const Scalar& axis_scalar, + bool is_runtime) { + PADDLE_ENFORCE_GE(x_meta.size(), + 0, + paddle::platform::errors::InvalidArgument( + "The size of input meta vector should be greater" + "than 0.")); + + int axis = axis_scalar.to(); + // 1. calculate axis + int rank = x_meta[0].dims.size(); + PADDLE_ENFORCE_EQ( + axis >= -rank && axis < rank, + true, + paddle::platform::errors::InvalidArgument( + "The axis is expected to be in range of [%d, %d), but got %d", + -rank, + rank, + axis)); + if (axis < 0) { + axis = axis + rank; + } + + // 2. calculate out dims + std::vector x_dims; + for (auto meta : x_meta) { + x_dims.push_back(meta.dims); + } + pten::DDim out_dim = + pten::funcs::ComputeAndCheckShape(is_runtime, x_dims, axis); + + return {x_meta[0].dtype, out_dim, x_meta[0].layout}; +} + +} // namespace pten diff --git a/paddle/pten/infermeta/multiary.h b/paddle/pten/infermeta/multiary.h index 6aa15159630bc..f8d5468e50d47 100644 --- a/paddle/pten/infermeta/multiary.h +++ b/paddle/pten/infermeta/multiary.h @@ -14,4 +14,13 @@ limitations under the License. 
*/ #pragma once -namespace pten {} // namespace pten +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/tensor_meta.h" +namespace pten { + +// TODO(chentianyu03) use std::vector as InferMeta inputs +DenseTensorMeta ConcatInferMeta(const std::vector& x_meta, + const Scalar& axis_scalar, + bool is_runtime); + +} // namespace pten diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index 45724e5d22abd..76e112808892d 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -24,7 +24,7 @@ endif() # pten depends all pten kernel targets set_property(GLOBAL PROPERTY PTEN_KERNELS "") -set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils) +set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) diff --git a/paddle/pten/kernels/concat_kernel.h b/paddle/pten/kernels/concat_kernel.h new file mode 100644 index 0000000000000..310b9ba8c0c4c --- /dev/null +++ b/paddle/pten/kernels/concat_kernel.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/infermeta/multiary.h" +#include "paddle/pten/kernels/empty_kernel.h" +namespace pten { + +template +void ConcatKernel(const Context& dev_ctx, + const std::vector& x, + const Scalar& axis, + DenseTensor* out); + +template +DenseTensor Concat(const Context& dev_ctx, + const std::vector& x, + const Scalar& axis) { + std::vector x_meta; + for (auto t : x) { + x_meta.push_back(t.meta()); + } + + auto out_meta = ConcatInferMeta(x_meta, axis.to(), true); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + ConcatKernel(dev_ctx, x, axis, &dense_out); + return dense_out; +} +} // namespace pten diff --git a/paddle/pten/kernels/cpu/concat_and_split.h b/paddle/pten/kernels/cpu/concat_and_split.h new file mode 100644 index 0000000000000..664ec6f66fc99 --- /dev/null +++ b/paddle/pten/kernels/cpu/concat_and_split.h @@ -0,0 +1,138 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +/* + * \brief Concatenate the input tensors along the dimension axis. + * TODO(zcd): maybe it needs to be more detailed. 
+ * Examples: + * Input[0] = [[1,2],[3,4]] + * Input[1] = [[5,6]] + * axis = 0 + * + * Output = [[1,2], + * [3,4], + * [5,6]] + */ + +template +void ConcatImpl(const Context& context, + const std::vector& input, + int axis, + DenseTensor* output) { + // TODO(zcd): Add input data validity checking + size_t num = input.size(); + + int64_t rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int64_t out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (size_t i = 0; i < num; ++i) { + int64_t t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + auto cpu_place = context.GetPlace(); + + // computation + auto output_data = output->data(); + int64_t col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int64_t col_len = input_cols[j]; + auto input_data = input[j].data(); + for (int64_t k = 0; k < out_rows; ++k) { + paddle::memory::Copy(cpu_place, + output_data + k * out_cols + col_idx, + cpu_place, + input_data + k * col_len, + sizeof(T) * col_len); + } + col_idx += col_len; + } +} + +/* + * \brief Split the input tensors along the dimension axis into outputs. + * TODO(zcd): maybe it needs to be more detailed. 
+ * Examples: + * Input = [[1,2], + * [3,4], + * [5,6]] + * axis = 0 + * + * Output[0] = [[1,2],[3,4]] + * Output[1] = [[5,6]] + */ +template +void SplitImpl(const Context& context, + const DenseTensor& input, + const std::vector& ref_inputs, + const int axis, + std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + // TODO(zcd): Add input data validity checking + size_t num = outputs->size(); + + int input_rows = 1; + auto dim_0 = ref_inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + input_rows *= dim_0[i]; + } + + int input_cols = 0; + + std::vector output_cols(outputs->size()); + for (size_t i = 0; i < num; ++i) { + int t_cols = ref_inputs[i]->numel() / input_rows; + input_cols += t_cols; + output_cols[i] = t_cols; + } + auto cpu_place = context.GetPlace(); + + // computation + for (int k = 0; k < input_rows; ++k) { + const T* src_ptr = input.data() + k * input_cols; + int col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int col_len = output_cols[j]; + auto* out_tensor = outputs->at(j); + if (out_tensor != nullptr) { + T* dst_ptr = out_tensor->data() + k * col_len; + paddle::memory::Copy(cpu_place, + dst_ptr, + cpu_place, + src_ptr + col_idx, + sizeof(T) * col_len); + } + col_idx += col_len; + } + } +} + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/concat_kernel.cc b/paddle/pten/kernels/cpu/concat_kernel.cc new file mode 100644 index 0000000000000..fb59c9c6005ff --- /dev/null +++ b/paddle/pten/kernels/cpu/concat_kernel.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/concat_kernel.h" + +#include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/core/lod_utils.h" +#include "paddle/pten/kernels/cpu/concat_and_split.h" +#include "paddle/pten/kernels/funcs/concat_funcs.h" + +namespace pten { + +template +void ConcatKernel(const Context& dev_ctx, + const std::vector& x, + const Scalar& axis_scalar, + DenseTensor* out) { + int64_t axis = axis_scalar.to(); + + axis = pten::funcs::ComputeAxis(axis, x[0].dims().size()); + + std::vector x_dims; + for (size_t i = 0; i < x.size(); ++i) { + x_dims.push_back(x[i].dims()); + } + + pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis); + out->Resize(out_dims); + out->mutable_data(); + + // If axis is 0, the lod of the output is not the same as inputs. + if (axis == 0 && x[0].lod().size() > 0) { + size_t lod_size_0 = x[0].lod().size(); + size_t lod_size = lod_size_0; + for (size_t i = 1; i < x.size(); ++i) { + if (x[i].lod().size() > 0) { + PADDLE_ENFORCE_EQ( + x[i].lod().size(), + lod_size_0, + paddle::platform::errors::Unimplemented( + "The lod level of all input LoDTensors should be same. " + "Maybe different lod level of input LoDTensors can concat," + "it is not supported currently. 
The lod level of %dth input " + "is %d and first input is %d.", + i, + x[i].lod().size(), + lod_size_0)); + } else { + lod_size = 0; + break; + } + } + if (lod_size) { + auto* out_lod = out->mutable_lod(); + for (size_t i = 1; i < x.size(); ++i) { + auto in_lod = pten::ConvertToLengthBasedLoD(x[i].lod()); + pten::AppendLoD(out_lod, in_lod); + } + } + } + + // Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && x.size() < 10) { + size_t output_offset = 0; + for (auto& in : x) { + if (in.numel() == 0UL) { + continue; + } + auto in_stride = paddle::framework::stride_numel(in.dims()); + auto out_stride = paddle::framework::stride_numel(out->dims()); + paddle::operators::StridedNumelCopyWithAxis( + dev_ctx, + axis, + out->data() + output_offset, + out_stride, + in.data(), + in_stride, + in_stride[axis]); + output_offset += in_stride[axis]; + } + } else { + std::vector inputs; + for (size_t j = 0; j < x.size(); ++j) { + if (x[j].numel() > 0) { + inputs.push_back(x[j]); + } else { + continue; + } + } + ConcatImpl(dev_ctx, inputs, axis, out); + } +} + +} // namespace pten + +PT_REGISTER_KERNEL(concat, + CPU, + ALL_LAYOUT, + pten::ConcatKernel, + float, + double, + bool, + int64_t, + int, + uint8_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/kernels/funcs/concat_funcs.h b/paddle/pten/kernels/funcs/concat_funcs.h new file mode 100644 index 0000000000000..8455b8096922c --- /dev/null +++ b/paddle/pten/kernels/funcs/concat_funcs.h @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" +namespace pten { +namespace funcs { + +static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { + PADDLE_ENFORCE_EQ( + axis >= -rank && axis < rank, + true, + paddle::platform::errors::InvalidArgument( + "The axis is expected to be in range of [%d, %d), but got %d", + -rank, + rank, + axis)); + if (axis < 0) { + axis = axis + rank; + } + return axis > 0 ? axis : 0; +} + +static inline pten::DDim ComputeAndCheckShape( + const bool is_runtime, + const std::vector& inputs_dims, + const size_t axis) { + const size_t n = inputs_dims.size(); + auto out_dims = inputs_dims[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + PADDLE_ENFORCE_EQ(inputs_dims[i].size(), + out_dims.size(), + paddle::platform::errors::InvalidArgument( + "The shape of input[0] and input[%d] " + "is expected to be equal." 
+ "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + i, + inputs_dims[0], + i, + inputs_dims[i])); + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + if (is_runtime) { + out_dims[axis] += inputs_dims[i][j]; + } else { + if (inputs_dims[i][j] == -1 || out_dims[j] == -1) { + out_dims[axis] = -1; + } else { + out_dims[axis] += inputs_dims[i][j]; + } + } + } else { + bool check_shape = + is_runtime || (inputs_dims[0][j] > 0 && inputs_dims[i][j] > 0); + if (check_shape) { + // check all shape in run time + PADDLE_ENFORCE_EQ(inputs_dims[0][j], + inputs_dims[i][j], + paddle::platform::errors::InvalidArgument( + "The %d-th dimension of input[0] and input[%d] " + "is expected to be equal." + "But received input[0]'s shape = " + "[%s], input[%d]'s shape = [%s].", + j, + i, + inputs_dims[0], + i, + inputs_dims[i])); + } + if (!is_runtime && out_dims[j] == -1 && inputs_dims[i][j] > 0) { + out_dims[j] = inputs_dims[i][j]; + } + } + } + } + return out_dims; +} + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/gpu/concat_and_split.h b/paddle/pten/kernels/gpu/concat_and_split.h new file mode 100644 index 0000000000000..66b21b5f51351 --- /dev/null +++ b/paddle/pten/kernels/gpu/concat_and_split.h @@ -0,0 +1,569 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" + +namespace pten { + +template +__global__ void ConcatKernel_(const T** inputs, + const int64_t* input_cols, + int col_size, + const int64_t output_rows, + const int64_t output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = input_cols[0]; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = input_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = input_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + + const T* input_ptr = inputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * segment_width + local_col]; + } +} + +template +__device__ void ConcatKernelDetail(const T** inputs_data, + const int fixed_in_col, + const int out_rows, + const int out_cols, + T* output_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * 1.0 / fixed_in_col; + int in_offset = tid_x - split * fixed_in_col; + const T* input_ptr = inputs_data[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) { + output_data[tid_y * out_cols + tid_x] = + input_ptr[tid_y * fixed_in_col + in_offset]; + } + } +} + +template +__global__ void ConcatKernel_(const T* 
input_addr0, + const T* input_addr1, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[2]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const T* input_addr2, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[3]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + inputs_data[2] = input_addr2; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T* input_addr0, + const T* input_addr1, + const T* input_addr2, + const T* input_addr3, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + const T* inputs_data[4]; + inputs_data[0] = input_addr0; + inputs_data[1] = input_addr1; + inputs_data[2] = input_addr2; + inputs_data[3] = input_addr3; + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void ConcatKernel_(const T** inputs_data, + const int in_num, + const int64_t fixed_in_col, + const int64_t out_rows, + const int64_t out_cols, + T* output_data) { + ConcatKernelDetail( + inputs_data, fixed_in_col, out_rows, out_cols, output_data); +} + +template +__global__ void SplitKernel(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t* out_cols, + int out_cols_size, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + 
curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__device__ void SplitKernelDetail(const T* input_data, + const int in_row, + const int in_col, + const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T** outputs_data) { + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1) { + T* outputs_data[2]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1, + T* outputs_addr2) { + T* outputs_data[3]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + outputs_data[2] = outputs_addr2; + 
SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +template +__global__ void SplitKernel(const T* input_data, + const int64_t in_row, + const int64_t in_col, + const int64_t fixed_out_col, + T* outputs_addr0, + T* outputs_addr1, + T* outputs_addr2, + T* outputs_addr3) { + T* outputs_data[4]; + outputs_data[0] = outputs_addr0; + outputs_data[1] = outputs_addr1; + outputs_data[2] = outputs_addr2; + outputs_data[3] = outputs_addr3; + SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); +} + +static inline void GetBlockDims( + const paddle::platform::CUDADeviceContext& context, + int64_t num_rows, + int64_t num_cols, + dim3* block_dims, + dim3* grid_dims) { + // Set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (num_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((num_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + *block_dims = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int64_t max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((num_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = std::min(max_blocks / grid_cols, + std::max(num_rows / block_rows, (int64_t)1)); + *grid_dims = dim3(grid_cols, grid_rows, 1); +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
+ */ +template +void ConcatImpl(const Context& context, + const std::vector& input, + int axis, + pten::DenseTensor* output) { + // TODO(zcd): Add input data validity checking + int in_num = input.size(); + int64_t in_row = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + in_row *= dim_0[i]; + } + int64_t in_col = input[0].numel() / in_row; + int64_t out_row = in_row, out_col = 0; + + int inputs_col_num = in_num + 1; + std::vector inputs_data_vec(in_num); + std::vector inputs_col_vec(inputs_col_num); + const T** inputs_data = inputs_data_vec.data(); + int64_t* inputs_col = inputs_col_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. 
Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, col_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + in_num * sizeof(T*)); + inputs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + inputs_col_num * sizeof(int)); + inputs_col = reinterpret_cast(col_alloc->ptr()); +#endif + + inputs_col[0] = 0; + bool has_same_shape = true; + for (int i = 0; i < in_num; ++i) { + int64_t t_cols = input[i].numel() / in_row; + if (has_same_shape) { + if (t_cols != in_col) has_same_shape = false; + } + out_col += t_cols; + inputs_col[i + 1] = out_col; + inputs_data[i] = input[i].data(); + } + + dim3 block_dims; + dim3 grid_dims; + GetBlockDims(context, out_row, out_col, &block_dims, &grid_dims); + + paddle::memory::allocation::AllocationPtr tmp_dev_ins_data; + const T** dev_ins_data = nullptr; + if (!has_same_shape || in_num < 2 || in_num > 4) { + tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_data, in_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_data->ptr(), + paddle::platform::CPUPlace(), + restored, + in_num * sizeof(T*), + context.stream()); + dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); + } + + if (has_same_shape) { + if (in_num == 2) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + in_col, + out_row, + out_col, + output->data()); + } else if (in_num == 3) { + ConcatKernel_<<>>( + inputs_data[0], + inputs_data[1], + inputs_data[2], + in_col, + out_row, + out_col, + output->data()); + } else if (in_num == 4) { + ConcatKernel_<<>>( + inputs_data[0], + 
inputs_data[1], + inputs_data[2], + inputs_data[3], + in_col, + out_row, + out_col, + output->data()); + } else { + ConcatKernel_<<>>( + dev_ins_data, in_num, in_col, out_row, out_col, output->data()); + } + } else { + auto tmp_dev_ins_col_data = + paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t)); + + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_col, inputs_col_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_col_data->ptr(), + paddle::platform::CPUPlace(), + restored, + inputs_col_num * sizeof(int64_t), + context.stream()); + int64_t* dev_ins_col_data = + static_cast(tmp_dev_ins_col_data->ptr()); + + ConcatKernel_<<>>( + dev_ins_data, + dev_ins_col_data, + static_cast(inputs_col_num), + out_row, + out_col, + output->data()); + } + +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* col_alloc_released = col_alloc.release(); + context.AddStreamCallback([data_alloc_released, col_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + paddle::memory::allocation::Allocator::AllocationDeleter( + col_alloc_released); + }); +#endif +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
+ */ +template +void SplitImpl(const Context& context, + const pten::DenseTensor& input, + const std::vector& ref_inputs, + int axis, + std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + + // TODO(zcd): Add input data validity checking + int o_num = outputs->size(); + int64_t out_row = 1; + auto dim_0 = ref_inputs[0]->dims(); + for (int i = 0; i < axis; ++i) { + out_row *= dim_0[i]; + } + + int64_t out0_col = ref_inputs[0]->numel() / out_row; + int64_t in_col = 0, in_row = out_row; + bool has_same_shape = true; + + int outputs_cols_num = o_num + 1; + std::vector outputs_data_vec(o_num); + std::vector outputs_cols_vec(outputs_cols_num); + T** outputs_data = outputs_data_vec.data(); + int64_t* outputs_cols = outputs_cols_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. 
Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + paddle::memory::AllocationPtr data_alloc, cols_alloc; + // TODO(chentianyu03): try to find a method to remove the Alloc function + data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + o_num * sizeof(T*)); + outputs_data = reinterpret_cast(data_alloc->ptr()); + // TODO(chentianyu03): try to find a method to remove the Alloc function + cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), + (outputs_cols_num) * sizeof(int64_t)); + outputs_cols = reinterpret_cast(cols_alloc->ptr()); +#endif + + outputs_cols[0] = 0; + for (int i = 0; i < o_num; ++i) { + int64_t t_col = ref_inputs.at(i)->numel() / out_row; + if (has_same_shape) { + if (t_col != out0_col) has_same_shape = false; + } + in_col += t_col; + outputs_cols[i + 1] = in_col; + if (outputs->at(i) != nullptr) { + outputs_data[i] = outputs->at(i)->data(); + } else { + outputs_data[i] = nullptr; + } + } + + dim3 block_dims; + dim3 grid_dims; + GetBlockDims(context, out_row, in_col, &block_dims, &grid_dims); + + paddle::memory::allocation::AllocationPtr tmp_dev_outs_data; + T** dev_out_gpu_data = nullptr; + if (!has_same_shape || o_num < 2 || o_num > 4) { + // TODO(chentianyu03): try to find a method to remove the Alloc function + tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_data, o_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_outs_data->ptr(), + paddle::platform::CPUPlace(), + restored, + o_num * sizeof(T*), + context.stream()); + dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); + } + + if (has_same_shape) { + if (o_num == 2) { + SplitKernel<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1]); + } else if (o_num == 3) { + SplitKernel<<>>( + input.data(), + in_row, 
+ in_col, + out0_col, + outputs_data[0], + outputs_data[1], + outputs_data[2]); + } else if (o_num == 4) { + SplitKernel<<>>( + input.data(), + in_row, + in_col, + out0_col, + outputs_data[0], + outputs_data[1], + outputs_data[2], + outputs_data[3]); + } else { + SplitKernel<<>>( + input.data(), in_row, in_col, out0_col, dev_out_gpu_data); + } + } else { + auto tmp_dev_ins_col_data = + // TODO(chentianyu03): try to find a method to remove the Alloc function + paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); + auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_cols, outputs_cols_num); + paddle::memory::Copy(context.GetPlace(), + tmp_dev_ins_col_data->ptr(), + paddle::platform::CPUPlace(), + restored, + outputs_cols_num * sizeof(int64_t), + context.stream()); + int64_t* dev_outs_col_data = + reinterpret_cast(tmp_dev_ins_col_data->ptr()); + + SplitKernel<<>>( + input.data(), + in_row, + in_col, + dev_outs_col_data, + static_cast(outputs_cols_num), + dev_out_gpu_data); + } +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* cols_alloc_released = cols_alloc.release(); + context.AddStreamCallback([data_alloc_released, cols_alloc_released] { + paddle::memory::allocation::Allocator::AllocationDeleter( + data_alloc_released); + paddle::memory::allocation::Allocator::AllocationDeleter( + cols_alloc_released); + }); +#endif +} + +} // namespace pten diff --git a/paddle/pten/kernels/gpu/concat_kernel.cu b/paddle/pten/kernels/gpu/concat_kernel.cu new file mode 100644 index 0000000000000..6ddfef460fc6c --- /dev/null +++ b/paddle/pten/kernels/gpu/concat_kernel.cu @@ -0,0 +1,125 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/concat_kernel.h" + +#include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/core/lod_utils.h" +#include "paddle/pten/kernels/funcs/concat_funcs.h" +#include "paddle/pten/kernels/gpu/concat_and_split.h" + +namespace pten { + +template +void ConcatKernel(const Context& dev_ctx, + const std::vector& x, + const Scalar& axis_scalar, + DenseTensor* out) { + int64_t axis = axis_scalar.to(); + + axis = pten::funcs::ComputeAxis(axis, x[0].dims().size()); + + std::vector x_dims; + for (size_t i = 0; i < x.size(); ++i) { + x_dims.push_back(x[i].dims()); + } + + pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis); + out->Resize(out_dims); + out->mutable_data(); + + // If axis is 0, the lod of the output is not the same as inputs. + if (axis == 0 && x[0].lod().size() > 0) { + size_t lod_size_0 = x[0].lod().size(); + size_t lod_size = lod_size_0; + for (size_t i = 1; i < x.size(); ++i) { + if (x[i].lod().size() > 0) { + PADDLE_ENFORCE_EQ( + x[i].lod().size(), + lod_size_0, + paddle::platform::errors::Unimplemented( + "The lod level of all input LoDTensors should be same. 
" + "Maybe different lod level of input LoDTensors can concat," + "it is not supported currently. The lod level of %dth input " + "is %d and first input is %d.", + i, + x[i].lod().size(), + lod_size_0)); + } else { + lod_size = 0; + break; + } + } + if (lod_size) { + auto* out_lod = out->mutable_lod(); + for (size_t i = 1; i < x.size(); ++i) { + auto in_lod = pten::ConvertToLengthBasedLoD(x[i].lod()); + pten::AppendLoD(out_lod, in_lod); + } + } + } + + // Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && x.size() < 10) { + size_t output_offset = 0; + for (auto& in : x) { + if (in.numel() == 0UL) { + continue; + } + auto in_stride = paddle::framework::stride_numel(in.dims()); + auto out_stride = paddle::framework::stride_numel(out->dims()); + paddle::operators::StridedNumelCopyWithAxis( + dev_ctx, + axis, + out->data() + output_offset, + out_stride, + in.data(), + in_stride, + in_stride[axis]); + output_offset += in_stride[axis]; + } + } else { + std::vector inputs; + for (size_t j = 0; j < x.size(); ++j) { + if (x[j].numel() > 0) { + inputs.push_back(x[j]); + } else { + continue; + } + } + ConcatImpl(dev_ctx, inputs, axis, out); + } +} + +} // namespace pten + +PT_REGISTER_KERNEL(concat, + GPU, + ALL_LAYOUT, + pten::ConcatKernel, + float, + double, + bool, + int64_t, + int, + uint8_t, + paddle::platform::float16, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt index 79d9a3d82e69e..e9faa22c4eb7b 100644 --- a/paddle/pten/tests/api/CMakeLists.txt +++ b/paddle/pten/tests/api/CMakeLists.txt @@ -21,3 +21,4 @@ cc_test(test_sum_api SRCS test_sum_api.cc DEPS pten_tensor pten_api pten_api_uti cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS pten_tensor pten_api pten_api_utils) cc_test(test_conj_api SRCS test_conj_api.cc DEPS 
pten_tensor pten_api pten_api_utils) +cc_test(test_concat_api SRCS test_concat_api.cc DEPS pten_tensor pten_api pten_api_utils) diff --git a/paddle/pten/tests/api/test_concat_api.cc b/paddle/pten/tests/api/test_concat_api.cc new file mode 100644 index 0000000000000..e84aee0aaaf4f --- /dev/null +++ b/paddle/pten/tests/api/test_concat_api.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/api/include/api.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace paddle { +namespace tests { + +namespace framework = paddle::framework; +using DDim = paddle::framework::DDim; + +// TODO(chentianyu03): Remove this test after the API is used in the dygraph +TEST(API, concat) { + // 1. 
create tensor + const auto alloc = std::make_unique( + paddle::platform::CPUPlace()); + auto dense_x = std::make_shared( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x->mutable_data(); + + auto dense_y = std::make_shared( + alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_y_data = dense_y->mutable_data(); + + for (size_t i = 0; i < 3; ++i) { + for (size_t j = 0; j < 10; ++j) { + dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0; + dense_y_data[i * 10 + j] = (i * 10 + j) * 1.0; + } + } + + paddle::experimental::Tensor x(dense_x); + paddle::experimental::Tensor y(dense_y); + + std::vector inputs{x, y}; + + // 2. test API + auto out = paddle::experimental::concat(inputs, 0); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 6); + ASSERT_EQ(out.dims()[1], 10); + ASSERT_EQ(out.numel(), 60); + ASSERT_EQ(out.is_cpu(), true); + ASSERT_EQ(out.type(), pten::DataType::FLOAT32); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); + + auto dense_out = std::dynamic_pointer_cast(out.impl()); + auto out_data = dense_out->data(); + for (size_t i = 0; i < 60; ++i) { + if (i < 30) { + ASSERT_NEAR(dense_x_data[i], out_data[i], 1e-6f); + } else { + ASSERT_NEAR(dense_y_data[i - 30], out_data[i], 1e-6f); + } + } +} + +} // namespace tests +} // namespace paddle diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt index 6f70f2ca2c895..407e5c097aec4 100644 --- a/paddle/pten/tests/kernels/CMakeLists.txt +++ b/paddle/pten/tests/kernels/CMakeLists.txt @@ -10,3 +10,4 @@ cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS pten pten cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS pten pten_api_utils) cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS pten 
pten_api_utils) cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS pten pten_api_utils) +cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS pten pten_api_utils) diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc new file mode 100644 index 0000000000000..c5d979ad908ff --- /dev/null +++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/pten/kernels/concat_kernel.h" + +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace pten { +namespace tests { + +namespace framework = paddle::framework; +using DDim = paddle::framework::DDim; + +TEST(DEV_API, concat) { + // 1. 
create tensor + const auto alloc = std::make_unique( + paddle::platform::CPUPlace()); + pten::DenseTensor dense_x(alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x.mutable_data(); + + pten::DenseTensor dense_y(alloc.get(), + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_y_data = dense_y.mutable_data(); + + for (size_t i = 0; i < 3; ++i) { + for (size_t j = 0; j < 10; ++j) { + dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0; + dense_y_data[i * 10 + j] = (i * 10 + j) * 1.0; + } + } + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + + std::vector inputs = {dense_x, dense_y}; + + // 2. test API + auto out = pten::Concat( + *(static_cast(dev_ctx)), inputs, 0); + + // 3. check result + ASSERT_EQ(out.dims().size(), 2); + ASSERT_EQ(out.dims()[0], 6); + ASSERT_EQ(out.dims()[1], 10); + ASSERT_EQ(out.meta().dtype, pten::DataType::FLOAT32); + ASSERT_EQ(out.meta().layout, pten::DataLayout::NCHW); + + auto out_data = out.data(); + + for (size_t i = 0; i < 60; ++i) { + if (i < 30) { + ASSERT_NEAR(dense_x_data[i], out_data[i], 1e-6f); + } else { + ASSERT_NEAR(dense_y_data[i - 30], out_data[i], 1e-6f); + } + } +} + +} // namespace tests +} // namespace pten diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 562a726aa29f2..1bf5344e83746 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -18,6 +18,16 @@ param : [x, out_dtype] data_type : x + +- api : concat + args : (const std::vector& x, const Scalar& axis) + output : Tensor + infer_meta : + func : ConcatInferMeta + param : [x, axis, true] + kernel : + func : concat + - api : conj args : (const Tensor& x) output : Tensor diff --git 
a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index e8539b11d1455..c994731585246 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -58,7 +58,10 @@ def parse_args(self, args_str): f"Args declaration should start with '(' and end with ')', please check the args of {self.api} in api.yaml." args_str = args_str[1:-1] args_list = args_str.split(',') - input_types = ['const Tensor&', 'const Tensor &'] + input_types = [ + 'const Tensor&', 'const Tensor &', 'const std::vector&', + 'const std::vector &' + ] attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \ 'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \ 'const std::vector&', 'Backend', 'DataLayout', 'DataType'] @@ -247,7 +250,7 @@ def gene_infer_meta(self, input_names, attr_names, infer_meta) -> str: param_code = "" for param in infer_meta_params: if param in input_names: - param_code = param_code + self.prefix_tensor_name + param + "->meta(), " + param_code = param_code + "GetDenseTensorMeta(" + self.prefix_tensor_name + param + "), " elif param in attr_names: param_code = param_code + param + ", " elif isinstance(param, str): @@ -267,7 +270,7 @@ def get_kernel_args(self, input_names, attrs, kernel_param): for input_name in input_names: # set input code input_tensor_code = input_tensor_code + f""" - auto {self.prefix_tensor_name}{input_name} = std::dynamic_pointer_cast({input_name}.impl());""" + auto {self.prefix_tensor_name}{input_name} = TensorToDenseTensor({input_name});""" attr_names = attrs['names'] if kernel_param is None: @@ -374,6 +377,35 @@ def api_namespace(): """) +def tensor_to_densetensor(): + return """ + std::shared_ptr TensorToDenseTensor(const Tensor& tensor) { + return std::dynamic_pointer_cast(tensor.impl()); + } + + std::shared_ptr> TensorToDenseTensor(const std::vector& tensors) { + std::vector pt_tensors; + + for(auto & t : tensors) { + 
pt_tensors.push_back(*std::dynamic_pointer_cast(t.impl())); + } + return std::make_shared>(pt_tensors); + } + + const pten::DenseTensorMeta GetDenseTensorMeta(const std::shared_ptr & x) { + return x->meta(); + } + + const std::vector GetDenseTensorMeta(const std::shared_ptr>& x) { + std::vector metas; + for(auto& t : *x) { + metas.push_back(t.meta()); + } + return metas; + } +""" + + def generate_api(api_yaml_path, header_file_path, source_file_path): with open(api_yaml_path, 'r') as f: @@ -390,6 +422,7 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): include_header_file = "paddle/pten/api/include/api.h" source_file.write(source_include(include_header_file)) source_file.write(namespace[0]) + source_file.write(tensor_to_densetensor()) for api in apis: api_code = API(api) From 854a7ab3589704499a8332b9967011c4457fd507 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Fri, 21 Jan 2022 18:46:24 +0800 Subject: [PATCH 13/15] add pten dependency to infrt (#39079) * add pten dependency to infrt * fix code style * add pten::CPUContext * revert .ignore --- paddle/infrt/CMakeLists.txt | 7 ++++-- paddle/infrt/host_context/value.h | 5 ++++ paddle/infrt/kernel/CMakeLists.txt | 1 + paddle/infrt/kernel/pten_kernels.cc | 37 +++++++++++++++++++++++++++++ paddle/infrt/kernel/pten_kernels.h | 35 +++++++++++++++++++++++++++ paddle/scripts/infrt_build.sh | 0 6 files changed, 83 insertions(+), 2 deletions(-) create mode 100644 paddle/infrt/kernel/pten_kernels.cc create mode 100644 paddle/infrt/kernel/pten_kernels.h mode change 100644 => 100755 paddle/scripts/infrt_build.sh diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 8af3012a220ad..e371e2391829d 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -1,3 +1,6 @@ +#TO DO:remove fluid +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) + if (NOT WITH_INFRT) return() endif() @@ -88,8 +91,8 @@ set(infrt_mlir_incs ) message(STATUS "infrt 
srcs:\n${infrt_src}") -cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto) -cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto) +cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto pten dense_tensor) +cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto pten dense_tensor) add_dependencies(infrt ${infrt_mlir_incs}) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 4a2b92a7e69c5..7f68e59f8a698 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -29,6 +29,9 @@ #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/dense_tensor.h" + namespace infrt { namespace host_context { @@ -45,6 +48,8 @@ using ValueVariantType = Variant, std::vector, std::vector, diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index da858aad28f81..7e9ed8e5572c0 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -2,6 +2,7 @@ core_gather_headers() gather_srcs(infrt_src SRCS basic_kernels.cc + pten_kernels.cc test_kernels.cc tensor_shape_kernels.cc tensor_kernels.cc diff --git a/paddle/infrt/kernel/pten_kernels.cc b/paddle/infrt/kernel/pten_kernels.cc new file mode 100644 index 0000000000000..70c44b829f774 --- /dev/null +++ b/paddle/infrt/kernel/pten_kernels.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/pten_kernels.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/math_kernel.h" + +using infrt::host_context::Attribute; + +namespace infrt { +namespace kernel { + +void RegisterPtenKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("pd_cpu.add.float32", + INFRT_KERNEL(pten::AddKernel)); + registry->AddKernel("pd_cpu.add.int32", + INFRT_KERNEL(pten::AddKernel)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/pten_kernels.h b/paddle/infrt/kernel/pten_kernels.h new file mode 100644 index 0000000000000..c290f8ea524fb --- /dev/null +++ b/paddle/infrt/kernel/pten_kernels.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +namespace infrt { +namespace host_context { + +struct KernelRegistry; + +} // namespace host_context +} // namespace infrt + +namespace infrt { +namespace kernel { + +/** + * Register all the pten kernels to registry. + */ +void RegisterPtenKernels(host_context::KernelRegistry* registry); + +} // namespace kernel +} // namespace infrt diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh old mode 100644 new mode 100755 From a0f586bc626b3fddcc104e46e521e37bc7e4e302 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 21 Jan 2022 20:03:11 +0800 Subject: [PATCH 14/15] [PTen]Separate origin Kernel and add Kernel for C++ API (#39002) * add kernel for c++ api * fix compile bugs * fix kunlun compile bugs * perfect cmake * fix compile bugs when run ci-inference * fix compile bugs * add non-raw kernel for fluid op * fix compile bugs * fix compile bugs * fix unit test bug --- cmake/pten_kernel.cmake | 61 +++-- paddle/fluid/operators/cholesky_solve_op.h | 2 +- .../elementwise/elementwise_add_op.h | 2 +- .../elementwise/elementwise_div_op.h | 2 +- .../elementwise/elementwise_mul_op.cu | 4 +- .../elementwise/elementwise_mul_op.h | 2 +- .../operators/elementwise/elementwise_op.h | 24 +- .../elementwise/elementwise_sub_op.h | 2 +- paddle/fluid/operators/lu_op.h | 4 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 13 +- paddle/pten/api/include/kernel_signature.h | 6 - paddle/pten/core/kernel_alias_name.h | 12 +- paddle/pten/kernels/cpu/math_kernel.cc | 76 +++---- paddle/pten/kernels/gpu/math_kernel.cu | 77 ++++--- paddle/pten/kernels/math_kernel.cc | 212 ++++++++++++++++++ paddle/pten/kernels/math_kernel.h | 125 ++++++----- .../tests/kernels/test_elementwise_dev_api.cc | 12 +- python/paddle/utils/code_gen/api.yaml | 7 +- 18 files changed, 453 insertions(+), 190 deletions(-) create mode 100644 paddle/pten/kernels/math_kernel.cc diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake index bc9fefb58f452..c2928376a02f8 
100644 --- a/cmake/pten_kernel.cmake +++ b/cmake/pten_kernel.cmake @@ -103,38 +103,55 @@ function(kernel_library TARGET) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) - if (${common_srcs_len} GREATER 0) - # If the kernel has a device independent public implementation, - # we will use this implementation and will not adopt the implementation - # under specific devices + # Build Target according different src organization + if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR + ${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0) + # If the common_srcs depends on specific device srcs, build target using this rule. + if (WITH_GPU) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) + nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part) + endif() + elseif (WITH_ROCM) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) + hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part) + endif() + else() + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part) + endif() + endif() + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) + nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) + hip_library(${TARGET} SRCS 
${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() endif() else() - # If the kernel has a header file declaration, but no corresponding - # implementation can be found, this is not allowed - if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND - ${xpu_srcs_len} EQUAL 0) - message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") + if (${common_srcs_len} EQUAL 0) + message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") else() + # If the kernel has a device independent public implementation, + # we will use this implementation and will not adopt the implementation + # under specific devices if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - endif() + endif() endif() if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index 4b1d075de91ca..5004aad7c59bc 100644 --- 
a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -202,7 +202,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { commonterm_for_range(commonterm_functor); commonterm_conj = helper.Transpose(commonterm_conj); - pten::AddKernel( + pten::AddRawKernel( static_cast::TYPE &>(dev_ctx), commonterm, commonterm_conj, -1, &commonterm); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a4897a06d5611..5c4f791b2270c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -61,7 +61,7 @@ class ElementwiseAddKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - pten::AddKernel( + pten::AddRawKernel( static_cast::TYPE &>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 44f695278dca8..a45f09b63e9fe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -51,7 +51,7 @@ class ElementwiseDivKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - pten::DivideKernel( + pten::DivideRawKernel( static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 86a803106347d..0c7d12ae0ad55 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ 
b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -51,8 +51,8 @@ class ElementwiseMulKernel auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::MultiplyKernel(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, - pt_z.get()); + pten::MultiplyRawKernel(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, + pt_z.get()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. X's type should be " diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index d918407930d96..e7a5e48b1f1b5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -124,7 +124,7 @@ class ElementwiseMulKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::MultiplyKernel( + pten::MultiplyRawKernel( static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index e1d9655e293a3..aaf33ca674488 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -140,26 +140,42 @@ class ElementwiseOp : public framework::OperatorWithKernel { framework::KernelSignature GetExpectedPtenKernelArgs( const framework::ExecutionContext &ctx) const override { + int axis = ctx.Attr("axis"); if (Type() == "elementwise_add") { if (ctx.InputVar("X")->IsType()) { - return framework::KernelSignature("add", {"X", "Y"}, {"axis"}, {"Out"}); + if (axis == -1) { + return framework::KernelSignature("add", {"X", "Y"}, {}, 
{"Out"}); + } + return framework::KernelSignature("add_raw", {"X", "Y"}, {"axis"}, + {"Out"}); } } if (Type() == "elementwise_sub") { if (ctx.InputVar("X")->IsType()) { - return framework::KernelSignature("subtract", {"X", "Y"}, {"axis"}, + if (axis == -1) { + return framework::KernelSignature("subtract", {"X", "Y"}, {}, + {"Out"}); + } + return framework::KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, {"Out"}); } } if (Type() == "elementwise_div") { if (ctx.InputVar("X")->IsType()) { - return framework::KernelSignature("divide", {"X", "Y"}, {"axis"}, + if (axis == -1) { + return framework::KernelSignature("divide", {"X", "Y"}, {}, {"Out"}); + } + return framework::KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"}); } } if (Type() == "elementwise_mul") { if (ctx.InputVar("X")->IsType()) { - return framework::KernelSignature("multiply", {"X", "Y"}, {"axis"}, + if (axis == -1) { + return framework::KernelSignature("multiply", {"X", "Y"}, {}, + {"Out"}); + } + return framework::KernelSignature("multiply_raw", {"X", "Y"}, {"axis"}, {"Out"}); } } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 46d4a93e804f5..7d1749f20abf2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -51,7 +51,7 @@ class ElementwiseSubKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); - pten::SubtractKernel( + pten::SubtractRawKernel( static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 6beef1add8e4c..c3b3552ba1329 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -221,7 +221,7 @@ void Tensor_Add(const DeviceContext& 
dev_ctx, const framework::Tensor& src1, out->Resize(src1.dims()); out->mutable_data(dev_ctx.GetPlace()); - pten::AddKernel< + pten::AddRawKernel< T, typename paddle::framework::ConvertToPtenContext::TYPE>( static_cast::TYPE&>(dev_ctx), @@ -234,7 +234,7 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1, out->Resize(src1.dims()); out->mutable_data(dev_ctx.GetPlace()); - pten::SubtractKernel< + pten::SubtractRawKernel< T, typename paddle::framework::ConvertToPtenContext::TYPE>( static_cast::TYPE&>(dev_ctx), diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index e2002856a4d08..2e5bd7a42b1d1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -551,17 +551,26 @@ class ReduceOp : public framework::OperatorWithKernel { framework::KernelSignature GetExpectedPtenKernelArgs( const framework::ExecutionContext& ctx) const override { + bool reduce_all = ctx.Attr("reduce_all"); if (Type() == "reduce_sum") { if (ctx.InputVar("X")->IsType()) { + if (!reduce_all) { + return framework::KernelSignature( + "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"}); + } return framework::KernelSignature( - "sum", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"}, + "sum_raw", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"}, {"Out"}); } } if (Type() == "reduce_mean") { if (ctx.InputVar("X")->IsType()) { + if (!reduce_all) { + return framework::KernelSignature("mean", {"X"}, {"dim", "keep_dim"}, + {"Out"}); + } return framework::KernelSignature( - "mean", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); + "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); } } // TODO(chentianyu03): support other cases after selected rows added diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h index e3929d59159c1..d750b47ef864b 100644 --- a/paddle/pten/api/include/kernel_signature.h +++ 
b/paddle/pten/api/include/kernel_signature.h @@ -30,7 +30,6 @@ using DeviceContext = paddle::platform::DeviceContext; using add_kernel = void (*)(const DeviceContext&, const DenseTensor&, const DenseTensor&, - int, DenseTensor*); using cast_kernel = void (*)(const DeviceContext&, @@ -46,7 +45,6 @@ using concat_kernel = void (*)(const DeviceContext&, using divide_kernel = void (*)(const DeviceContext&, const DenseTensor&, const DenseTensor&, - int, DenseTensor*); using dot_kernel = void (*)(const DeviceContext&, @@ -82,13 +80,11 @@ using mean_kernel = void (*)(const DeviceContext&, const DenseTensor&, const std::vector&, bool, - bool, DenseTensor*); using multiply_kernel = void (*)(const DeviceContext&, const DenseTensor&, const DenseTensor&, - int, DenseTensor*); using reshape_kernel = void (*)(const DeviceContext&, @@ -107,14 +103,12 @@ using sum_kernel = void (*)(const DeviceContext&, const DenseTensor&, const std::vector&, bool, - bool, DataType, DenseTensor*); using subtract_kernel = void (*)(const DeviceContext&, const DenseTensor&, const DenseTensor&, - int, DenseTensor*); using conj_kernel = void (*)(const DeviceContext&, diff --git a/paddle/pten/core/kernel_alias_name.h b/paddle/pten/core/kernel_alias_name.h index 5c86787966368..8e089970f9139 100644 --- a/paddle/pten/core/kernel_alias_name.h +++ b/paddle/pten/core/kernel_alias_name.h @@ -20,10 +20,10 @@ namespace pten { // the key is kernel_name in fluid, the value is the kernel_name in pten // the key is sorted by key's alphabet const std::unordered_map kernel_alias_name_map = { - {"elementwise_add", "add"}, - {"elementwise_div", "divide"}, - {"elementwise_mul", "muliply"}, - {"elementwise_sub", "subtract"}, + {"elementwise_add", "add_raw"}, + {"elementwise_div", "divide_raw"}, + {"elementwise_mul", "muliply_raw"}, + {"elementwise_sub", "subtract_raw"}, {"fill_any_like", "full_like"}, {"fill_constant", "full"}, {"flatten_contiguous_range", "flatten"}, @@ -32,8 +32,8 @@ const std::unordered_map 
kernel_alias_name_map = { {"matmul_v2_grad", "matmul_grad"}, {"matmul_v2_grad_grad", "matmul_double_grad"}, {"matmul_v2_triple_grad", "matmul_triple_grad"}, - {"reduce_mean", "mean"}, - {"reduce_sum", "sum"}, + {"reduce_mean", "mean_raw"}, + {"reduce_sum", "sum_raw"}, {"reshape2", "reshape"}, {"reshape2_grad", "reshape_grad"}, {"reshape2_grad_grad", "reshape_double_grad"}, diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 7841dd4113cff..706a40936a393 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -32,11 +32,11 @@ namespace pten { #define DEFINE_CPU_ELEMENTWISE_OP(name) \ template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ out->mutable_data(); \ if (x.dims() == y.dims()) { \ SameDimsElementwiseCompute>()( \ @@ -55,23 +55,35 @@ namespace pten { } template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { auto out_dtype = x.dtype(); pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } template -void DivideKernel(const Context& dev_ctx, +void SumRawKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& y, - int axis, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, DenseTensor* out) { + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { // allocate memory 
for out out->mutable_data(); if (x.dims() == y.dims() && std::is_floating_point::value) { @@ -90,18 +102,6 @@ void DivideKernel(const Context& dev_ctx, } } -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - pten::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - // Create the definition of Add DEFINE_CPU_ELEMENTWISE_OP(Add) @@ -118,42 +118,40 @@ using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; -PT_REGISTER_KERNEL( - mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} -PT_REGISTER_KERNEL(add, +PT_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, - pten::AddKernel, + pten::AddRawKernel, float, double, int, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract, +PT_REGISTER_KERNEL(subtract_raw, CPU, ALL_LAYOUT, - pten::SubtractKernel, + pten::SubtractRawKernel, float, double, int, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(divide, +PT_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, - pten::DivideKernel, + pten::DivideRawKernel, float, double, int, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply, +PT_REGISTER_KERNEL(multiply_raw, CPU, ALL_LAYOUT, - pten::MultiplyKernel, + pten::MultiplyRawKernel, float, double, int, @@ -161,10 +159,10 @@ PT_REGISTER_KERNEL(multiply, bool, complex64, complex128) {} -PT_REGISTER_KERNEL(sum, +PT_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, - pten::SumKernel, + pten::SumRawKernel, bool, float, double, @@ -175,3 +173,5 @@ PT_REGISTER_KERNEL(sum, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } +PT_REGISTER_KERNEL( + mean_raw, CPU, ALL_LAYOUT, pten::MeanRawKernel, float, double, bool) {} diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 
d7a16ac49b1c9..6b6383f81065b 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -37,11 +37,11 @@ namespace pten { #define DEFINE_CUDA_ELEMENTWISE_OP(name) \ template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ std::vector inputs; \ std::vector outputs; \ inputs.emplace_back(&x); \ @@ -57,17 +57,29 @@ namespace pten { */ template -void MeanKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { auto out_dtype = x.dtype(); pten::Reduce( dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); } +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + // Create the definition of Add DEFINE_CUDA_ELEMENTWISE_OP(Add) // Create the definition of Subtract @@ -77,30 +89,16 @@ DEFINE_CUDA_ELEMENTWISE_OP(Multiply) // Create the definition of Divide DEFINE_CUDA_ELEMENTWISE_OP(Divide) -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - pten::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - } // namespace pten using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL( - mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, 
double, bool, float16) {} -PT_REGISTER_KERNEL(add, +PT_REGISTER_KERNEL(add_raw, GPU, ALL_LAYOUT, - pten::AddKernel, + pten::AddRawKernel, float, double, int, @@ -108,10 +106,10 @@ PT_REGISTER_KERNEL(add, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract, +PT_REGISTER_KERNEL(subtract_raw, GPU, ALL_LAYOUT, - pten::SubtractKernel, + pten::SubtractRawKernel, float, double, int, @@ -119,10 +117,10 @@ PT_REGISTER_KERNEL(subtract, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(divide, +PT_REGISTER_KERNEL(divide_raw, GPU, ALL_LAYOUT, - pten::DivideKernel, + pten::DivideRawKernel, float, double, int, @@ -130,10 +128,10 @@ PT_REGISTER_KERNEL(divide, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply, +PT_REGISTER_KERNEL(multiply_raw, GPU, ALL_LAYOUT, - pten::MultiplyKernel, + pten::MultiplyRawKernel, float, double, int, @@ -142,10 +140,10 @@ PT_REGISTER_KERNEL(multiply, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(sum, +PT_REGISTER_KERNEL(sum_raw, GPU, ALL_LAYOUT, - pten::SumKernel, + pten::SumRawKernel, bool, float, double, @@ -156,3 +154,12 @@ PT_REGISTER_KERNEL(sum, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } + +PT_REGISTER_KERNEL(mean_raw, + GPU, + ALL_LAYOUT, + pten::MeanRawKernel, + float, + double, + bool, + float16) {} diff --git a/paddle/pten/kernels/math_kernel.cc b/paddle/pten/kernels/math_kernel.cc new file mode 100644 index 0000000000000..423282ab97ca4 --- /dev/null +++ b/paddle/pten/kernels/math_kernel.cc @@ -0,0 +1,212 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/math_kernel.h" + +#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace pten { + +template +void MeanKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MeanRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DataType out_dtype, + DenseTensor* out) { + bool reduce_all = false; + SumRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out); +} + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + AddRawKernel(dev_ctx, x, y, axis, out); +} + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + SubtractRawKernel(dev_ctx, x, y, axis, out); +} + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + DivideRawKernel(dev_ctx, x, y, axis, out); +} + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + int axis = -1; + MultiplyRawKernel(dev_ctx, x, y, axis, out); +} + +} // namespace pten + +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +PT_REGISTER_KERNEL( + 
mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {} + +PT_REGISTER_KERNEL(sum, + CPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + paddle::platform::float16, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} + +PT_REGISTER_KERNEL(add, + CPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + CPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + CPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + CPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + complex64, + complex128) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_REGISTER_KERNEL(mean, + GPU, + ALL_LAYOUT, + pten::MeanKernel, + float, + double, + bool, + paddle::platform::float16) {} +PT_REGISTER_KERNEL(sum, + GPU, + ALL_LAYOUT, + pten::SumKernel, + bool, + float, + double, + paddle::platform::float16, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +PT_REGISTER_KERNEL(add, + GPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(subtract, + GPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(divide, + GPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + paddle::platform::float16, + complex64, + complex128) {} +PT_REGISTER_KERNEL(multiply, + GPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + paddle::platform::float16, + complex64, + complex128) {} 
+#endif diff --git a/paddle/pten/kernels/math_kernel.h b/paddle/pten/kernels/math_kernel.h index 65c0f84e696de..95379baaf3504 100644 --- a/paddle/pten/kernels/math_kernel.h +++ b/paddle/pten/kernels/math_kernel.h @@ -22,104 +22,127 @@ limitations under the License. */ namespace pten { +template +void MeanRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + template void MeanKernel(const Context& dev_ctx, const DenseTensor& x, const std::vector& dims, bool keep_dim, - bool reduce_all, DenseTensor* out); +template +void SumRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +void SumKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DataType out_dtype, + DenseTensor* out); + +template +void AddRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + template void AddKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, - int axis, DenseTensor* out); +template +void SubtractRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + template void SubtractKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, - int axis, DenseTensor* out); +template +void DivideRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + template void DivideKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, - int axis, DenseTensor* out); +template +void MultiplyRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + template void MultiplyKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& y, - int axis, 
DenseTensor* out); -template -void SumKernel(const Context& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - template DenseTensor Add(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - AddKernel(dev_ctx, x, y, axis, &dense_out); + const DenseTensor& y) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + AddKernel(dev_ctx, x, y, &dense_out); return dense_out; } template DenseTensor Subtract(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - SubtractKernel(dev_ctx, x, y, axis, &dense_out); + const DenseTensor& y) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + SubtractKernel(dev_ctx, x, y, &dense_out); return dense_out; } template DenseTensor Divide(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - DivideKernel(dev_ctx, x, y, axis, &dense_out); + const DenseTensor& y) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + DivideKernel(dev_ctx, x, y, &dense_out); return dense_out; } template DenseTensor Multiply(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), 
y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - MultiplyKernel(dev_ctx, x, y, axis, &dense_out); + const DenseTensor& y) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1); + auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); + MultiplyKernel(dev_ctx, x, y, &dense_out); return dense_out; } @@ -130,8 +153,7 @@ DenseTensor Mean(const Context& dev_ctx, bool keep_dim) { auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); - bool reduce_all = false; - MeanKernel(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out); + MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } @@ -144,12 +166,7 @@ DenseTensor Sum(const Context& dev_ctx, auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype); auto dense_out = pten::Empty(dev_ctx, std::move(out_meta)); - // The real value of reduce_all will be get in kernel - // so use default value(false) is OK. - bool reduce_all = false; - - SumKernel( - dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); + SumKernel(dev_ctx, x, axis, keep_dim, dtype, &dense_out); return dense_out; } diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index 0bc16371c0731..e5d9b05eec7b3 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -54,11 +54,10 @@ TEST(DEV_API, add) { for (size_t i = 0; i < 10; ++i) { dense_y_data[i] = i * 2.0; } - int axis = 1; // 2. test API pten::CPUContext dev_ctx; - auto dense_out = pten::Add(dev_ctx, dense_x, dense_y, axis); + auto dense_out = pten::Add(dev_ctx, dense_x, dense_y); // 3. check result ASSERT_EQ(dense_out.dims().size(), 2); @@ -101,11 +100,10 @@ TEST(DEV_API, subtract) { for (size_t i = 0; i < 10; ++i) { dense_y_data[i] = i * 2.0; } - int axis = 1; // 2. 
test API pten::CPUContext dev_ctx; - auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y, axis); + auto dense_out = pten::Subtract(dev_ctx, dense_x, dense_y); // 3. check result ASSERT_EQ(dense_out.dims().size(), 2); @@ -148,11 +146,10 @@ TEST(DEV_API, divide) { for (size_t i = 0; i < 10; ++i) { dense_y_data[i] = i * 2.0 + 1; } - int axis = 1; // 2. test API pten::CPUContext dev_ctx; - auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y, axis); + auto dense_out = pten::Divide(dev_ctx, dense_x, dense_y); // 3. check result ASSERT_EQ(dense_out.dims().size(), 2); @@ -195,11 +192,10 @@ TEST(DEV_API, multiply) { for (size_t i = 0; i < 10; ++i) { dense_y_data[i] = i * 2.0; } - int axis = 1; // 2. test API pten::CPUContext dev_ctx; - auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y, axis); + auto dense_out = pten::Multiply(dev_ctx, dense_x, dense_y); // 3. check result ASSERT_EQ(dense_out.dims().size(), 2); diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 1bf5344e83746..a0d7ce84f75fd 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -6,7 +6,6 @@ param : [x, y, -1] kernel : func : add - param : [x, y, -1] - api : cast args : (const Tensor& x, DataType out_dtype) @@ -44,7 +43,6 @@ param : [x, y, -1] kernel : func : divide - param : [x, y, -1] - api : dot args : (const Tensor& x, const Tensor& y) @@ -130,7 +128,6 @@ param: [x, axis, keep_dim] kernel : func : mean - param : [x, axis, keep_dim, false] - api : multiply args : (const Tensor& x, const Tensor& y) @@ -140,7 +137,6 @@ param : [x, y, -1] kernel : func : multiply - param : [x, y, -1] - api : ones_like args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED) @@ -172,7 +168,6 @@ param : [x, y, -1] kernel : func : subtract - param : [x, y, -1] - api : sum args : (const Tensor& x, const std::vector& axis={}, DataType 
dtype=DataType::UNDEFINED, bool keep_dim=false) @@ -182,7 +177,7 @@ param: [x, axis, keep_dim, dtype] kernel : func : sum - param : [x, axis, keep_dim, false, DataType::UNDEFINED] + param : [x, axis, keep_dim, dtype] data_type : x - api : zeros_like From a14dc68820dbb221831b13b8c43155f537e265e9 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Fri, 21 Jan 2022 20:56:04 +0800 Subject: [PATCH 15/15] [pten] fix test concat dev api build failed (#39117) * fix test concat dev api build failed * fix conflict * fix conflict --- paddle/fluid/operators/concat_op.h | 5 ++++- paddle/pten/kernels/cpu/concat_kernel.cc | 2 +- paddle/pten/kernels/gpu/concat_kernel.cu | 2 +- paddle/pten/tests/api/test_concat_api.cc | 6 ++++-- paddle/pten/tests/kernels/test_concat_dev_api.cc | 16 +++++++--------- 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 3eaffbdc8bf35..1d9c10bdb8cc6 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -80,7 +80,10 @@ class ConcatKernel : public framework::OpKernel { pt_ins.push_back(*in); } - pten::ConcatKernel(dev_ctx, pt_ins, axis, out); + pten::ConcatKernel( + static_cast::TYPE&>(dev_ctx), + pt_ins, axis, out); } }; diff --git a/paddle/pten/kernels/cpu/concat_kernel.cc b/paddle/pten/kernels/cpu/concat_kernel.cc index fb59c9c6005ff..c4aed7679bd72 100644 --- a/paddle/pten/kernels/cpu/concat_kernel.cc +++ b/paddle/pten/kernels/cpu/concat_kernel.cc @@ -43,7 +43,7 @@ void ConcatKernel(const Context& dev_ctx, pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis); out->Resize(out_dims); - out->mutable_data(); + out->mutable_data(dev_ctx.GetPlace()); // If axis is 0, the lod of the output is not the same as inputs. 
if (axis == 0 && x[0].lod().size() > 0) { diff --git a/paddle/pten/kernels/gpu/concat_kernel.cu b/paddle/pten/kernels/gpu/concat_kernel.cu index 6ddfef460fc6c..e52e3a3d6446c 100644 --- a/paddle/pten/kernels/gpu/concat_kernel.cu +++ b/paddle/pten/kernels/gpu/concat_kernel.cu @@ -43,7 +43,7 @@ void ConcatKernel(const Context& dev_ctx, pten::DDim out_dims = pten::funcs::ComputeAndCheckShape(true, x_dims, axis); out->Resize(out_dims); - out->mutable_data(); + out->mutable_data(dev_ctx.GetPlace()); // If axis is 0, the lod of the output is not the same as inputs. if (axis == 0 && x[0].lod().size() > 0) { diff --git a/paddle/pten/tests/api/test_concat_api.cc b/paddle/pten/tests/api/test_concat_api.cc index e84aee0aaaf4f..c003e89f6c009 100644 --- a/paddle/pten/tests/api/test_concat_api.cc +++ b/paddle/pten/tests/api/test_concat_api.cc @@ -37,14 +37,16 @@ TEST(API, concat) { pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); - auto* dense_x_data = dense_x->mutable_data(); + auto* dense_x_data = + dense_x->mutable_data(paddle::platform::CPUPlace()); auto dense_y = std::make_shared( alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); - auto* dense_y_data = dense_y->mutable_data(); + auto* dense_y_data = + dense_y->mutable_data(paddle::platform::CPUPlace()); for (size_t i = 0; i < 3; ++i) { for (size_t j = 0; j < 10; ++j) { diff --git a/paddle/pten/tests/kernels/test_concat_dev_api.cc b/paddle/pten/tests/kernels/test_concat_dev_api.cc index c5d979ad908ff..6f9ea1b0d990a 100644 --- a/paddle/pten/tests/kernels/test_concat_dev_api.cc +++ b/paddle/pten/tests/kernels/test_concat_dev_api.cc @@ -25,7 +25,7 @@ namespace pten { namespace tests { namespace framework = paddle::framework; -using DDim = paddle::framework::DDim; +using DDim = pten::framework::DDim; TEST(DEV_API, concat) { // 1. 
create tensor @@ -35,13 +35,15 @@ TEST(DEV_API, concat) { pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); - auto* dense_x_data = dense_x.mutable_data(); + auto* dense_x_data = + dense_x.mutable_data(paddle::platform::CPUPlace()); pten::DenseTensor dense_y(alloc.get(), pten::DenseTensorMeta(pten::DataType::FLOAT32, framework::make_ddim({3, 10}), pten::DataLayout::NCHW)); - auto* dense_y_data = dense_y.mutable_data(); + auto* dense_y_data = + dense_y.mutable_data(paddle::platform::CPUPlace()); for (size_t i = 0; i < 3; ++i) { for (size_t j = 0; j < 10; ++j) { @@ -50,15 +52,11 @@ TEST(DEV_API, concat) { } } - paddle::platform::DeviceContextPool& pool = - paddle::platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); - std::vector inputs = {dense_x, dense_y}; // 2. test API - auto out = pten::Concat( - *(static_cast(dev_ctx)), inputs, 0); + pten::CPUContext dev_ctx; + auto out = pten::Concat(dev_ctx, inputs, 0); // 3. check result ASSERT_EQ(out.dims().size(), 2);