From f138371cbf3df997d4750cb6c1233e76d613fdea Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 16 Feb 2022 17:18:13 +0800 Subject: [PATCH 01/19] [MLU] support adaptive pooling (#39500) --- paddle/fluid/operators/mlu/mlu_baseop.cc | 23 ++ paddle/fluid/operators/mlu/mlu_baseop.h | 12 ++ paddle/fluid/operators/pool_op_mlu.cc | 151 ++++++++----- .../tests/unittests/mlu/test_pool2d_op_mlu.py | 200 +++++++++++++++--- 4 files changed, 309 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 068b31a6b7d21..82ea75943dee4 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -1151,6 +1151,18 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output, workspace_ptr, workspace_size)); } +/* static */ void MLUCnnl::AdaptivePoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlTensorDescriptor_t index_desc, void* index) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlAdaptivePoolingForward(handle, input_desc, input, pool_mode, + output_desc, output, index_desc, index)); +} + /* static */ void MLUCnnl::Pool3D( const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const std::vector<int64_t>& output_shape, @@ -1802,6 +1814,17 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { y, diff_y_desc, diff_y, x_desc, x, beta, diff_x_desc, diff_x)); } +/* static */ void MLUCnnl::AdaptivePoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t index_desc, const void* index, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAdaptivePoolingBackward( + handle, y_desc, y, index_desc, index, pool_mode, diff_x_desc, diff_x)); +} + /* static */ void MLUCnnl::NonMaxSuppression( const ExecutionContext& ctx, const cnnlNmsDescriptor_t nms_desc, const cnnlTensorDescriptor_t boxes_desc, const void* boxes, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index ad912c034683f..91eddaf792e8a 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -649,6 +649,12 @@ class MLUCnnl { const void* input, const void* beta, const void* extra_input_ptr, const cnnlTensorDescriptor_t output_desc, void* output); + static void AdaptivePoolingForward( + const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t input_desc, const void* input, + const cnnlTensorDescriptor_t output_desc, void* output, + const cnnlTensorDescriptor_t index_desc, void* index); + static void Pool3D(const ExecutionContext& ctx, cnnlPoolingMode_t pool_mode, const std::vector<int64_t>& output_shape, cnnlPoolingDescriptor_t pooling_desc, const void* alpha, @@ -958,6 +964,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t x_desc, const void* x, const void* beta, const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void AdaptivePoolingBackward( + const ExecutionContext& ctx, const cnnlPoolingMode_t pool_mode, + const cnnlTensorDescriptor_t y_desc, const void* y, + const cnnlTensorDescriptor_t index_desc, const void* index, + const cnnlTensorDescriptor_t diff_x_desc, void* diff_x); + static void
PoolingIndex(const ExecutionContext& ctx, const cnnlPoolingDescriptor_t pooling_desc, const cnnlTensorDescriptor_t x_desc, const void* x, diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index a64a9c274ed7d..1bbd671323e6d 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -21,12 +21,12 @@ namespace operators { namespace { cnnlPoolingMode_t ToCnnlPoolingMode(const std::string &pooling_type, - bool exclusive) { + bool exclusive, bool adaptive) { cnnlPoolingMode_t pooling_mode; if (pooling_type == "max") { pooling_mode = CNNL_POOLING_MAX; } else if (pooling_type == "avg") { - if (exclusive) { + if (exclusive && !adaptive) { pooling_mode = CNNL_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; } else { pooling_mode = CNNL_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; @@ -64,10 +64,7 @@ class MLUPoolOpKernel : public framework::OpKernel { platform::errors::InvalidArgument( "Only support 4-dims for mlu pool2d kernel.")); - PADDLE_ENFORCE_EQ(adaptive, false, - platform::errors::InvalidArgument( - "Not support adaptive for mlu pool2d kernel.")); - + const bool channel_last = data_format == "NHWC"; // default cnnlTensorLayout_t cnnl_layout = CNNL_LAYOUT_NCHW; auto out_dims = out->dims(); @@ -77,7 +74,6 @@ class MLUPoolOpKernel : public framework::OpKernel { framework::DDim data_dims = framework::slice_ddim(in_x_dims, 2, in_x_dims.size()); - const bool channel_last = data_format == "NHWC"; if (channel_last) { cnnl_layout = CNNL_LAYOUT_NHWC; out_h = out_dims[1]; @@ -94,42 +90,74 @@ class MLUPoolOpKernel : public framework::OpKernel { MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); MLUCnnlTensorDesc out_desc(*out, cnnl_layout, ToCnnlDataType()); - cnnlPoolingMode_t pool_mode = ToCnnlPoolingMode(pooling_type, exclusive); - MLUCnnlPoolingDesc pool_desc( - pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], - paddings[1], paddings[2], paddings[3], strides[0], strides[1], - 1 /*row_dilation*/, 1 /*col_dilation*/, ceil_mode); + cnnlPoolingMode_t pool_mode = + ToCnnlPoolingMode(pooling_type, exclusive, adaptive); + + if (!adaptive) { + MLUCnnlPoolingDesc pool_desc( + pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], + paddings[1], paddings[2], paddings[3], strides[0], strides[1], + 1 /*row_dilation*/, 1 /*col_dilation*/, ceil_mode); + + size_t extra_input_size = 0; + cnnlHandle_t handle = + ctx.template device_context().cnnl_handle(); + cnnlGetPoolingExtraInputSize(handle, pool_mode, out_w, out_h, + &extra_input_size); - size_t extra_input_size = 0; - cnnlHandle_t handle = - ctx.template device_context().cnnl_handle(); - cnnlGetPoolingExtraInputSize(handle, pool_mode, out_w, out_h, - &extra_input_size); - - if (extra_input_size > 0) { - paddle::platform::CPUDeviceContext cpu_ctx; - framework::Tensor extra_host_tensor = - ctx.AllocateTmpTensor( - {static_cast(extra_input_size)}, cpu_ctx); - cnnlInitPoolingExtraInput(handle, pool_desc.get(), in_x_desc.get(), - out_desc.get(), GetBasePtr(&extra_host_tensor)); - framework::Tensor extra_device_tensor = - ctx.AllocateTmpTensor( - {static_cast(extra_input_size)}, dev_ctx); - // TODO(fwg): use Async copy, and add a callback to stream that free host - // memory. 
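The adaptive branch introduced by this kernel sizes each pooling window from the output index rather than from ksize/strides. A minimal Python sketch of that window arithmetic, assuming the usual floor/ceil convention behind the adaptive_start_index/adaptive_end_index helpers that the new unit test imports from test_pool2d_op:

import numpy as np

def adaptive_start_index(index, size, out_size):
    # left edge of output bin `index` over an axis of length `size`
    return int(np.floor(index * size / out_size))

def adaptive_end_index(index, size, out_size):
    # right edge (exclusive) of the same bin
    return int(np.ceil((index + 1) * size / out_size))

# e.g. an axis of 7 pooled adaptively into 3 bins gives windows [0,3), [2,5), [4,7)
assert [(adaptive_start_index(i, 7, 3), adaptive_end_index(i, 7, 3))
        for i in range(3)] == [(0, 3), (2, 5), (4, 7)]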
- framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), - &extra_device_tensor); - MLUCnnl::PoolingForward( - ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, - in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, - GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/, out_desc.get(), - GetBasePtr(out)); + if (extra_input_size > 0) { + paddle::platform::CPUDeviceContext cpu_ctx; + framework::Tensor extra_host_tensor = + ctx.AllocateTmpTensor( + {static_cast(extra_input_size)}, cpu_ctx); + cnnlInitPoolingExtraInput(handle, pool_desc.get(), in_x_desc.get(), + out_desc.get(), + GetBasePtr(&extra_host_tensor)); + framework::Tensor extra_device_tensor = + ctx.AllocateTmpTensor( + {static_cast(extra_input_size)}, dev_ctx); + // TODO(fwg): use Async copy, and add a callback to stream that free + // host + // memory. + framework::TensorCopySync(extra_host_tensor, ctx.GetPlace(), + &extra_device_tensor); + MLUCnnl::PoolingForward( + ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, + in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, + GetBasePtr(&extra_device_tensor) /*params_shape_ptr*/, + out_desc.get(), GetBasePtr(out)); + } else { + MLUCnnl::PoolingForward( + ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, + in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, + nullptr /*params_shape_ptr*/, out_desc.get(), GetBasePtr(out)); + } } else { - MLUCnnl::PoolingForward( - ctx, pool_mode, out_h, out_w, pool_desc.get(), nullptr /*alpha*/, - in_x_desc.get(), GetBasePtr(in_x), nullptr /*beta*/, - nullptr /*params_shape_ptr*/, out_desc.get(), GetBasePtr(out)); + // cnnl Adaptive pooling only support NHWC layout + framework::Tensor trans_in_x; + framework::Tensor trans_out; + if (channel_last) { + trans_in_x = *in_x; + trans_out = *out; + } else { + std::vector perm{0, 2, 3, 1}; + TransposeFromMLUTensor(ctx, perm, in_x, &trans_in_x, + true /*need_reshape_or_alloc*/); + trans_out = ctx.AllocateTmpTensor( + {out_dims[0], out_dims[2], out_dims[3], out_dims[1]}, dev_ctx); + } + MLUCnnlTensorDesc trans_in_x_desc(trans_in_x, CNNL_LAYOUT_NHWC, + ToCnnlDataType()); + MLUCnnlTensorDesc trans_out_desc(trans_out, CNNL_LAYOUT_NHWC, + ToCnnlDataType()); + MLUCnnl::AdaptivePoolingForward( + ctx, pool_mode, trans_in_x_desc.get(), GetBasePtr(&trans_in_x), + trans_out_desc.get(), GetBasePtr(&trans_out), nullptr, nullptr); + if (!channel_last) { + std::vector perm{0, 3, 1, 2}; + TransposeFromMLUTensor(ctx, perm, &trans_out, out, + false /*need_reshape_or_alloc*/); + } } } }; @@ -204,7 +232,8 @@ class MLUPoolGradOpKernel : public framework::OpKernel { MLUCnnlTensorDesc trans_in_x_grad_desc(trans_in_x_grad, CNNL_LAYOUT_NHWC, ToCnnlDataType()); - cnnlPoolingMode_t pool_mode = ToCnnlPoolingMode(pooling_type, exclusive); + cnnlPoolingMode_t pool_mode = + ToCnnlPoolingMode(pooling_type, exclusive, adaptive); MLUCnnlPoolingDesc pool_desc( pool_mode, CNNL_NOT_PROPAGATE_NAN, ksize[0], ksize[1], paddings[0], paddings[1], paddings[2], paddings[3], strides[0], strides[1], @@ -219,18 +248,34 @@ class MLUPoolGradOpKernel : public framework::OpKernel { MLUCnnl::PoolingIndex(ctx, pool_desc.get(), trans_in_x_desc.get(), GetBasePtr(&trans_in_x), index_tensor_desc.get(), GetBasePtr(&index_tensor)); - MLUCnnl::PoolingBackward( - ctx, pool_desc.get(), nullptr /*alpha*/, index_tensor_desc.get(), - GetBasePtr(&index_tensor), trans_out_grad_desc.get(), - GetBasePtr(&trans_out_grad), trans_in_x_desc.get(), - GetBasePtr(&trans_in_x), nullptr /*beta*/, trans_in_x_grad_desc.get(), - 
GetBasePtr(&trans_in_x_grad)); + if (adaptive) { + MLUCnnl::AdaptivePoolingBackward( + ctx, pool_mode, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), index_tensor_desc.get(), + GetBasePtr(&index_tensor), trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } else { + MLUCnnl::PoolingBackward( + ctx, pool_desc.get(), nullptr /*alpha*/, index_tensor_desc.get(), + GetBasePtr(&index_tensor), trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), trans_in_x_desc.get(), + GetBasePtr(&trans_in_x), nullptr /*beta*/, + trans_in_x_grad_desc.get(), GetBasePtr(&trans_in_x_grad)); + } } else { - MLUCnnl::PoolingBackward(ctx, pool_desc.get(), nullptr /*alpha*/, nullptr, - nullptr, trans_out_grad_desc.get(), - GetBasePtr(&trans_out_grad), nullptr, nullptr, - nullptr /*beta*/, trans_in_x_grad_desc.get(), - GetBasePtr(&trans_in_x_grad)); + if (adaptive) { + MLUCnnl::AdaptivePoolingBackward( + ctx, pool_mode, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), nullptr /*index_tensor_desc.get()*/, + nullptr /*GetBasePtr(&index_tensor)*/, trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } else { + MLUCnnl::PoolingBackward(ctx, pool_desc.get(), nullptr /*alpha*/, + nullptr, nullptr, trans_out_grad_desc.get(), + GetBasePtr(&trans_out_grad), nullptr, nullptr, + nullptr /*beta*/, trans_in_x_grad_desc.get(), + GetBasePtr(&trans_in_x_grad)); + } } if (!channel_last) { std::vector perm{0, 3, 1, 2}; diff --git a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py index 2d9703117671c..fd442c6205e98 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_pool2d_op_mlu.py @@ -25,7 +25,125 @@ import sys sys.path.append('..') from op_test import OpTest -from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive +from test_pool2d_op import pool2D_forward_naive, avg_pool2D_forward_naive, max_pool2D_forward_naive, adaptive_start_index, adaptive_end_index + + +def pool2d_backward_navie(x, + ksize, + strides, + paddings, + global_pool=0, + ceil_mode=False, + exclusive=True, + adaptive=False, + data_format='NCHW', + pool_type="max", + padding_algorithm="EXPLICIT"): + # update paddings + def _get_padding_with_SAME(input_shape, pool_size, pool_stride): + padding = [] + for input_size, filter_size, stride_size in zip(input_shape, pool_size, + pool_stride): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max(( + (out_size - 1) * stride_size + filter_size - input_size, 0)) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + if isinstance(padding_algorithm, str): + padding_algorithm = padding_algorithm.upper() + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError("Unknown Attr(padding_algorithm): '%s'. " + "It can only be 'SAME' or 'VALID'." % + str(padding_algorithm)) + + if padding_algorithm == "VALID": + paddings = [0, 0, 0, 0] + if ceil_mode != False: + raise ValueError( + "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode)" + " must be False. 
" + "Received ceil_mode: True.") + elif padding_algorithm == "SAME": + input_data_shape = [] + if data_format == "NCHW": + input_data_shape = x.shape[2:4] + elif data_format == "NHWC": + input_data_shape = x.shape[1:3] + paddings = _get_padding_with_SAME(input_data_shape, ksize, strides) + + assert len(paddings) == 2 or len(paddings) == 4 + is_sys = True if len(paddings) == 2 else False + + if data_format == "NHWC": + x = x.transpose([0, 3, 1, 2]) + + N, C, H, W = x.shape + + if global_pool == 1: + ksize = [H, W] + paddings = [0 for _ in range(len(paddings))] + + pad_h_up = paddings[0] if is_sys else paddings[0] + pad_h_down = paddings[0] if is_sys else paddings[1] + pad_w_left = paddings[1] if is_sys else paddings[2] + pad_w_right = paddings[1] if is_sys else paddings[3] + + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + pad_h_up + pad_h_down + strides[0] - 1) // strides[0] + 1 \ + if ceil_mode else (H - ksize[0] + pad_h_up + pad_h_down) // strides[0] + 1 + W_out = (W - ksize[1] + pad_w_left + pad_w_right + strides[1] - 1) // strides[1] + 1 \ + if ceil_mode else (W - ksize[1] + pad_w_left + pad_w_right) // strides[1] + 1 + + x_grad = np.zeros_like(x) + for i in range(H_out): + if adaptive: + in_h_start = adaptive_start_index(i, H, ksize[0]) + in_h_end = adaptive_end_index(i, H, ksize[0]) + else: + in_h_start = np.max((i * strides[0] - pad_h_up, 0)) + in_h_end = np.min((i * strides[0] + ksize[0] - pad_h_up, H)) + + for j in range(W_out): + if adaptive: + in_w_start = adaptive_start_index(j, W, ksize[1]) + in_w_end = adaptive_end_index(j, W, ksize[1]) + else: + in_h_start = i * strides[0] - pad_h_up + in_w_start = j * strides[1] - pad_w_left + in_h_end = i * strides[0] + ksize[0] - pad_h_up + in_w_end = j * strides[1] + ksize[1] - pad_w_left + + field_size = (in_h_end - in_h_start) * (in_w_end - in_w_start) + in_h_start = np.max((in_h_start, 0)) + in_w_start = np.max((in_w_start, 0)) + in_h_end = np.min((in_h_end, H)) + in_w_end = np.min((in_w_end, W)) + + if pool_type == 'avg': + if (exclusive or adaptive): + field_size = (in_h_end - in_h_start) * ( + in_w_end - in_w_start) + x_grad[:, :, in_h_start:in_h_end, in_w_start: + in_w_end] += 1 / field_size + elif pool_type == 'max': + for n in range(N): + for c in range(C): + idx = np.argmax(x[n, c, in_h_start:in_h_end, in_w_start: + in_w_end].flatten()) + idx_h = idx // (in_w_end - in_w_start) + idx_w = idx % (in_w_end - in_w_start) + x_grad[n, c, in_h_start + idx_h, in_w_start + + idx_w] += 1 + + if data_format == "NHWC": + x_grad = x_grad.transpose([0, 2, 3, 1]) + return x_grad class TestPool2D_Op_Mixin(object): @@ -71,12 +189,25 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad(self): - if self.dtype == np.float16: - return - - if self.pool_type != "max": - self.check_grad_with_place( - self.place, set(['X']), 'Out', max_relative_error=0.07) + x_grad = pool2d_backward_navie( + self.inputs["X"], + ksize=self.ksize, + strides=self.strides, + paddings=self.paddings, + global_pool=self.global_pool, + ceil_mode=False, + exclusive=self.exclusive, + adaptive=self.adaptive, + data_format=self.data_format, + pool_type=self.pool_type, + padding_algorithm=self.padding_algorithm) + x_grad = x_grad / np.prod(self.outputs['Out'].shape) + self.check_grad_with_place( + self.place, + set(['X']), + 'Out', + max_relative_error=0.06, + user_defined_grads=[x_grad]) def init_data_format(self): self.data_format = "NCHW" @@ -108,7 +239,6 @@ def init_ceil_mode(self): def init_exclusive(self): 
self.exclusive = True - # Not support adaptive pooling currently def init_adaptive(self): self.adaptive = False @@ -173,7 +303,7 @@ def init_pool_type(self): self.pool2D_forward_naive = max_pool2D_forward_naive -def create_test_fp16_class(parent, check_grad=True): +def create_test_fp16_class(parent): class TestFp16Case(parent): def init_data_type(self): self.dtype = np.float16 @@ -182,19 +312,13 @@ def test_check_output(self): place = core.MLUPlace(0) self.check_output_with_place(place, atol=1e-3) - def test_check_grad(self): - place = core.MLUPlace(0) - if self.pool_type != "max" and check_grad: - self.check_grad_with_place( - place, set(['X']), 'Out', max_relative_error=0.07) - cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") TestFp16Case.__name__ = cls_name globals()[cls_name] = TestFp16Case create_test_fp16_class(TestPool2D_Op) -create_test_fp16_class(TestCase1, check_grad=False) +create_test_fp16_class(TestCase1) create_test_fp16_class(TestCase2) create_test_fp16_class(TestCase3) create_test_fp16_class(TestCase4) @@ -222,6 +346,24 @@ def init_exclusive(self): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + +class TestAvgPoolAdaptiveAsyOutSize(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_shape(self): + self.shape = [8, 3, 6, 6] + + def init_test_case(self): + self.ksize = [2, 3] + self.strides = [1, 1] + self.paddings = [0, 0, 0, 0] + + #-------test pool2d with asymmetric padding----- @@ -302,6 +444,19 @@ def init_shape(self): self.shape = [2, 3, 7, 7] +class TestAvgPoolAdaptive_AsyPadding(TestCase1): + def init_adaptive(self): + self.adaptive = True + + def init_test_case(self): + self.ksize = [3, 3] + self.strides = [1, 1] + self.paddings = [1, 1, 0, 2] + + def init_shape(self): + self.shape = [2, 3, 7, 7] + + #----------- test channel_last -------------- class TestPool2D_channel_last(TestPool2D_Op): def init_data_format(self): @@ -359,14 +514,6 @@ class TestCase5_Max(TestCase2): def init_pool_type(self): self.pool_type = "max" - def test_check_grad(self): - if self.dtype == np.float16: - return - place = core.MLUPlace(0) - if self.pool_type == "max": - self.check_grad_with_place( - place, set(['X']), 'Out', max_relative_error=1.00) - class TestCase5_channel_last_Max(TestCase5_Max): def init_data_format(self): @@ -381,6 +528,11 @@ def init_exclusive(self): self.exclusive = False +class TestAvgPoolAdaptive_channel_last(TestCase1_channel_last): + def init_adaptive(self): + self.adaptive = True + + class TestPool2D_AsyPadding_channel_last(TestPool2D_AsyPadding): def init_data_format(self): self.data_format = "NHWC" From e254e7c656f9fe3fc5136e51e1972e1753b7a1e2 Mon Sep 17 00:00:00 2001 From: TTerror Date: Wed, 16 Feb 2022 17:21:34 +0800 Subject: [PATCH 02/19] optimize prior_box for kunlun, *test=kunlun (#39477) --- .../operators/detection/prior_box_op_xpu.cc | 17 +- .../unittests/xpu/test_prior_box_op_xpu.py | 342 +++++++++--------- 2 files changed, 182 insertions(+), 177 deletions(-) diff --git a/paddle/fluid/operators/detection/prior_box_op_xpu.cc b/paddle/fluid/operators/detection/prior_box_op_xpu.cc index bab394689546e..c39f702a48644 100644 --- a/paddle/fluid/operators/detection/prior_box_op_xpu.cc +++ b/paddle/fluid/operators/detection/prior_box_op_xpu.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/detection/prior_box_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -81,21 +82,17 @@ class PriorBoxOpXPUKernel : public framework::OpKernel { dev_ctx.x_context(), boxes_data, aspect_ratios_param, min_sizes_param, max_sizes_param, feature_height, feature_width, img_height, img_width, offset, step_height, step_width, clip, min_max_aspect_ratios_order); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU gen_prior_box kernel return wrong value[%d %s]", - ret, XPUAPIErrorMsg[ret])); + PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gen_prior_box"); int box_num = feature_height * feature_width * num_priors; int vlen = variances.size(); + std::vector var_cpu(vlen * box_num); for (int i = 0; i < box_num; ++i) { - ret = xpu_memcpy(vars_data + i * vlen, variances.data(), vlen * sizeof(K), - XPUMemcpyKind::XPU_HOST_TO_DEVICE); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External( - "XPU xpu_memcpy return wrong " - "value[%d %s] in prior_box.", - ret, XPUAPIErrorMsg[ret])); + std::copy(variances.begin(), variances.end(), var_cpu.begin() + i * vlen); } + ret = xpu_memcpy(vars_data, var_cpu.data(), var_cpu.size() * sizeof(K), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_XPU_SUCCESS(ret); } }; diff --git a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py index 44137f4718743..0830237d5a89d 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_prior_box_op_xpu.py @@ -14,188 +14,196 @@ from __future__ import print_function -import unittest +import math import numpy as np import sys +import unittest sys.path.append("..") -import math + import paddle -from op_test import OpTest + from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestPriorBoxOp(XPUOpTest): - def set_data(self): - self.init_test_params() - self.init_test_input() - self.init_test_output() - self.inputs = {'Input': self.input, 'Image': self.image} - - self.attrs = { - 'min_sizes': self.min_sizes, - 'aspect_ratios': self.aspect_ratios, - 'variances': self.variances, - 'flip': self.flip, - 'clip': self.clip, - 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, - 'step_w': self.step_w, - 'step_h': self.step_h, - 'offset': self.offset - } - if len(self.max_sizes) > 0: - self.attrs['max_sizes'] = self.max_sizes - - self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} - - def test_check_output(self): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - pass - - def setUp(self): - self.op_type = "prior_box" - self.use_xpu = True - self.set_data() - - def set_max_sizes(self): - max_sizes = [5, 10] - self.max_sizes = np.array(max_sizes).astype('float32').tolist() - - def set_min_max_aspect_ratios_order(self): - self.min_max_aspect_ratios_order = False - - def init_test_params(self): - self.layer_w = 32 - self.layer_h = 32 - - self.image_w = 40 - self.image_h = 40 - - self.step_w = float(self.image_w) / float(self.layer_w) - self.step_h = float(self.image_h) / float(self.layer_h) - - self.input_channels = 2 - self.image_channels = 3 - self.batch_size = 10 - - self.min_sizes = [2, 4] - self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() - 
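The var_cpu change above replaces box_num separate host-to-device copies with one staged transfer. A small numpy sketch, with illustrative sizes only, of the buffer layout that host-side loop builds (each of the box_num boxes repeats the same vlen variance values):

import numpy as np

vlen, box_num = 4, 6  # illustrative sizes, not taken from the kernel
variances = np.array([0.1, 0.1, 0.2, 0.2], dtype=np.float32)

var_cpu = np.tile(variances, box_num)  # filled once, then a single xpu_memcpy
assert var_cpu.shape == (box_num * vlen,)
assert (var_cpu.reshape(box_num, vlen) == variances).all()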
self.set_max_sizes() - self.aspect_ratios = [2.0, 3.0] - self.flip = True - self.set_min_max_aspect_ratios_order() - self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] - self.aspect_ratios = np.array( - self.aspect_ratios, dtype=np.float).flatten() - self.variances = [0.1, 0.1, 0.2, 0.2] - self.variances = np.array(self.variances, dtype=np.float).flatten() - - self.clip = True - self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) - if len(self.max_sizes) > 0: - self.num_priors += len(self.max_sizes) - self.offset = 0.5 - - def init_test_input(self): - self.image = np.random.random( - (self.batch_size, self.image_channels, self.image_w, - self.image_h)).astype('float32') - - self.input = np.random.random( - (self.batch_size, self.input_channels, self.layer_w, - self.layer_h)).astype('float32') - - def init_test_output(self): - out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) - out_boxes = np.zeros(out_dim).astype('float32') - out_var = np.zeros(out_dim).astype('float32') - - idx = 0 - for h in range(self.layer_h): - for w in range(self.layer_w): - c_x = (w + self.offset) * self.step_w - c_y = (h + self.offset) * self.step_h - idx = 0 - for s in range(len(self.min_sizes)): - min_size = self.min_sizes[s] - if not self.min_max_aspect_ratios_order: - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - else: - c_w = c_h = min_size / 2. - out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, - (c_y - c_h) / self.image_h, - (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h] - idx += 1 - if len(self.max_sizes) > 0: - max_size = self.max_sizes[s] - # second prior: aspect_ratio = 1, - c_w = c_h = math.sqrt(min_size * max_size) / 2 - out_boxes[h, w, idx, :] = [ - (c_x - c_w) / self.image_w, (c_y - c_h) / - self.image_h, (c_x + c_w) / self.image_w, - (c_y + c_h) / self.image_h - ] - idx += 1 - - # rest of priors - for r in range(len(self.real_aspect_ratios)): - ar = self.real_aspect_ratios[r] - if abs(ar - 1.) 
< 1e-6: - continue - c_w = min_size * math.sqrt(ar) / 2 - c_h = (min_size / math.sqrt(ar)) / 2 +class XPUTestPriorBoxOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'prior_box' + self.use_dynamic_create_class = False + + class TestPriorBoxOp(XPUOpTest): + def setUp(self): + self.op_type = "prior_box" + self.use_xpu = True + self.dtype = self.in_type + self.set_data() + + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'min_sizes': self.min_sizes, + 'aspect_ratios': self.aspect_ratios, + 'variances': self.variances, + 'flip': self.flip, + 'clip': self.clip, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset + } + if len(self.max_sizes) > 0: + self.attrs['max_sizes'] = self.max_sizes + + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + + def set_max_sizes(self): + max_sizes = [5, 10] + self.max_sizes = np.array(max_sizes).astype('float32').tolist() + + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = False + + def init_test_params(self): + self.layer_w = 32 + self.layer_h = 32 + + self.image_w = 40 + self.image_h = 40 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.min_sizes = [2, 4] + self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() + self.set_max_sizes() + self.aspect_ratios = [2.0, 3.0] + self.flip = True + self.set_min_max_aspect_ratios_order() + self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] + self.aspect_ratios = np.array( + self.aspect_ratios, dtype=np.float).flatten() + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.clip = True + self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) + if len(self.max_sizes) > 0: + self.num_priors += len(self.max_sizes) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, + self.image_h)).astype(self.dtype) + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, + self.layer_h)).astype(self.dtype) + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype(self.dtype) + out_var = np.zeros(out_dim).astype(self.dtype) + + idx = 0 + for h in range(self.layer_h): + for w in range(self.layer_w): + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + idx = 0 + for s in range(len(self.min_sizes)): + min_size = self.min_sizes[s] + if not self.min_max_aspect_ratios_order: + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / 
self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + else: + c_w = c_h = min_size / 2. out_boxes[h, w, idx, :] = [ (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h, (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h ] idx += 1 - - # clip the prior's coordidate such that it is within[0, 1] - if self.clip: - out_boxes = np.clip(out_boxes, 0.0, 1.0) - # set the variance. - out_var = np.tile(self.variances, (self.layer_h, self.layer_w, - self.num_priors, 1)) - self.out_boxes = out_boxes.astype('float32') - self.out_var = out_var.astype('float32') - - -class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): - def set_max_sizes(self): - self.max_sizes = [] - - -class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): - def set_min_max_aspect_ratios_order(self): - self.min_max_aspect_ratios_order = True - + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if abs(ar - 1.) < 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / + self.image_h, (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h + ] + idx += 1 + + # clip the prior's coordidate such that it is within[0, 1] + if self.clip: + out_boxes = np.clip(out_boxes, 0.0, 1.0) + # set the variance. + out_var = np.tile(self.variances, (self.layer_h, self.layer_w, + self.num_priors, 1)) + self.out_boxes = out_boxes.astype(self.dtype) + self.out_var = out_var.astype(self.dtype) + + class TestPriorBoxOpWithoutMaxSize(TestPriorBoxOp): + def set_max_sizes(self): + self.max_sizes = [] + + class TestPriorBoxOpWithSpecifiedOutOrder(TestPriorBoxOp): + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = True + + +support_types = get_xpu_op_support_types('prior_box') +for stype in support_types: + create_test_class(globals(), XPUTestPriorBoxOp, stype) if __name__ == '__main__': unittest.main() From 7d53a288f6b63e075085860ed51f6a4388d31353 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 16 Feb 2022 17:30:52 +0800 Subject: [PATCH 03/19] [fleet exe] Update comm init for dist model (#39603) --- .../distributed/fleet_executor/dist_model.cc | 131 ++++-------------- .../distributed/fleet_executor/dist_model.h | 8 +- paddle/fluid/pybind/bind_fleet_executor.cc | 11 +- 3 files changed, 35 insertions(+), 115 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 40b0a8b55e17a..941d470f87935 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -53,7 +53,6 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, } else if (input_data.dtype == DistModelDataType::INT32) { input_tensor_ptr = input_tensor->mutable_data(dims, place); } else { - // Q(fleet exe dev): for input/output, should we support fp16 LOG(ERROR) << "unsupported feed type " << input_data.dtype; return false; } @@ -113,14 +112,6 @@ std::string DistModelDTypeToString(DistModelDataType dtype) { return "NOT SUPPORT DTYPE"; } -bool IsPPFirstStage(const 
DistModelConfig &config) { - return config.local_rank - config.mp_degree < 0; -} - -bool IsPPLastStage(const DistModelConfig &config) { - return config.local_rank + config.mp_degree >= config.nranks; -} - class DistModelTimer { public: void tic() { tic_time = std::chrono::high_resolution_clock::now(); } @@ -197,65 +188,34 @@ bool DistModel::PreparePlace() { } bool DistModel::CommInit() { - // NOTE (Yuang Liu): The peer endpoints will be obtained with the assumption - // that mp part is always on inner side and pp part is always on outer side. - // TODO(fleet exe dev): The peer endpoints could be configured by users. - PADDLE_ENFORCE_EQ( - config_.pp_degree * config_.mp_degree, config_.nranks, - platform::errors::InvalidArgument( - "The mp_degree multiplies pp_degree is not equal with nranks")); std::unique_ptr comm_init_program( new framework::ProgramDesc()); framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); - if (config_.mp_degree > 1) { - PADDLE_ENFORCE_GE( - config_.mp_ring_id, 0, - platform::errors::InvalidArgument( - "mp ring id must be provided for inference under mp.")); - VLOG(3) << "Init comm group for mp."; + std::vector &ring_ids = + config_.rank_to_ring_ids_[config_.local_rank]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = config_.ring_id_to_ranks_[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = config_.ring_id_to_ranks_[ring_id]; + for (int64_t rank : ranks) { + if (config_.local_rank == rank) { + break; + } + rank_in_group += 1; + } std::vector peer_endpoints; - for (int64_t - idx = (config_.local_rank / config_.mp_degree) * config_.mp_degree, - i = 0; - i < config_.mp_degree; ++idx, ++i) { - if (config_.trainer_endpoints[idx] == config_.current_endpoint) { + for (int64_t rank : ranks) { + if (config_.local_rank == rank) { continue; } - peer_endpoints.emplace_back(config_.trainer_endpoints[idx]); - } - // get nranks in a mp group and inner group rank for local rank - int64_t mp_group_nranks = config_.nranks / config_.pp_degree; - int64_t mp_group_rank = config_.local_rank % config_.mp_degree; - InsertCommOp("mp_comm_id", mp_group_nranks, mp_group_rank, peer_endpoints, - comm_init_block, config_.mp_ring_id); - } - if (config_.pp_degree > 1) { - VLOG(3) << "Init comm group for pp."; - if (!IsPPFirstStage(config_)) { - PADDLE_ENFORCE_EQ(config_.pp_upstream_ring_id >= 0, true, - platform::errors::InvalidArgument( - "pp upstream ring id must be provided for " - "non-first pp stage if inference under pp.")); - // not the first pp stage, has upstream - std::vector upstream_peer_endpoints; - upstream_peer_endpoints.emplace_back( - config_.trainer_endpoints[config_.local_rank - config_.mp_degree]); - InsertCommOp("pp_upstream_comm_id", 2, 1, upstream_peer_endpoints, - comm_init_block, config_.pp_upstream_ring_id); - } - - if (!IsPPLastStage(config_)) { - PADDLE_ENFORCE_EQ(config_.pp_downstream_ring_id >= 0, true, - platform::errors::InvalidArgument( - "pp downstream ring id must be provided for " - "non-last pp stage if inference under pp.")); - // not the last pp stage, has downstream - std::vector downstream_peer_endpoints; - downstream_peer_endpoints.emplace_back( - config_.trainer_endpoints[config_.local_rank + config_.mp_degree]); - InsertCommOp("pp_downstream_comm_id", 2, 0, downstream_peer_endpoints, - comm_init_block, config_.pp_downstream_ring_id); + 
peer_endpoints.emplace_back(config_.trainer_endpoints[rank]); } + InsertCommOp(var_name_base + std::to_string(order), ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; } framework::NaiveExecutor e(place_); e.CreateVariables(*comm_init_program, 0, true, scope_.get()); @@ -409,12 +369,7 @@ bool DistModel::LoadParameters() { bool DistModel::PrepareFleetExe() { task_node_.reset(new TaskNode(program_.get(), config_.local_rank)); - if (config_.local_rank - config_.mp_degree >= 0) { - task_node_->AddUpstreamTask(config_.local_rank - config_.mp_degree); - } - if (config_.local_rank + config_.mp_degree < config_.nranks) { - task_node_->AddDownstreamTask(config_.local_rank + config_.mp_degree); - } + // With auto cut, there is no concept of pp, no need to add dependency. task_node_->SetType("Compute"); task_node_->Init(); executor_desc_ = FleetExecutorDesc(); @@ -473,40 +428,13 @@ bool DistModel::PrepareFeedAndFetch() { } } - if (config_.pp_degree == 1) { - if (feeds_.size() == 0) { - LOG(ERROR) << "No feed ops in the inf program, please check the program."; - return false; - } - if (fetches_.size() == 0) { - LOG(ERROR) << "No fetch op in the inf program, please check the program."; - return false; - } - } else { - if (IsPPFirstStage(config_)) { - if (feeds_.size() == 0) { - LOG(ERROR) << "Feed ops are needed for the first pp stage."; - return false; - } - } else { - if (feeds_.size() > 0) { - LOG(WARNING) << "Feed op is found in the non-first stage of pp."; - } else { - LOG(INFO) << "No feed ops in non-first pp stage."; - } - } - if (IsPPLastStage(config_)) { - if (fetches_.size() == 0) { - LOG(WARNING) << "No fetch op was found in the last pp stage. Make sure " - "the result has been sent to frist pp stage."; - } - } else { - if (fetches_.size() > 0) { - LOG(WARNING) << "Fetch op is found in the non-last stage of pp."; - } else { - LOG(INFO) << "No fetch op in non-last pp stage."; - } - } + if (feeds_.size() == 0) { + LOG(ERROR) << "No feed ops in the inf program, please check the program."; + return false; + } + if (fetches_.size() == 0) { + LOG(ERROR) << "No fetch op in the inf program, please check the program."; + return false; } return true; } @@ -606,7 +534,6 @@ bool DistModel::FetchResult(const framework::LoDTensor &fetch, bool DistModel::Run(const std::vector &input_data, std::vector *output_data) { - // TODO(fleet exe dev): support pipeline inf mode VLOG(3) << "DistModel run for once."; DistModelTimer timer; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h index c980178b67c52..d0203c131357c 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model.h @@ -13,6 +13,7 @@ // limitations under the License. 
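With the auto-cut scheme, the caller now spells out the communication topology through the two maps added below. A hypothetical sketch of a single two-rank ring, assuming the config object is exposed to Python as core.DistModelConfig through the pybind bindings in this patch; the endpoints and ids are invented:

from paddle.fluid import core  # assumed import path for the bound class

config = core.DistModelConfig()
config.nranks = 2
config.local_rank = 0
config.trainer_endpoints = ["127.0.0.1:6170", "127.0.0.1:6171"]  # invented values
config.current_endpoint = config.trainer_endpoints[0]
config.ring_id_to_ranks = {0: [0, 1]}       # ring 0 spans both ranks
config.rank_to_ring_ids = {0: [0], 1: [0]}  # each rank joins ring 0

CommInit then walks rank_to_ring_ids[local_rank] and, for each ring, derives rank_in_group and the peer endpoints from ring_id_to_ranks.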
#pragma once +#include #include #include #include @@ -47,12 +48,9 @@ struct DistModelConfig { std::string current_endpoint{}; int64_t nranks{1}; int64_t local_rank{0}; - int64_t mp_degree{1}; - int64_t pp_degree{1}; - int64_t mp_ring_id{-1}; - int64_t pp_upstream_ring_id{-1}; - int64_t pp_downstream_ring_id{-1}; bool enable_timer{false}; + std::map> ring_id_to_ranks_{}; + std::map> rank_to_ring_ids_{}; }; class DistModel { diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 0422a9cf8cc0a..7bb7f03983eb9 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -151,14 +151,9 @@ void BindFleetExecutor(py::module* m) { .def_readwrite("current_endpoint", &DistModelConfig::current_endpoint) .def_readwrite("nranks", &DistModelConfig::nranks) .def_readwrite("local_rank", &DistModelConfig::local_rank) - .def_readwrite("mp_degree", &DistModelConfig::mp_degree) - .def_readwrite("pp_degree", &DistModelConfig::pp_degree) - .def_readwrite("mp_ring_id", &DistModelConfig::mp_ring_id) - .def_readwrite("enable_timer", &DistModelConfig::enable_timer) - .def_readwrite("pp_upstream_ring_id", - &DistModelConfig::pp_upstream_ring_id) - .def_readwrite("pp_downstream_ring_id", - &DistModelConfig::pp_downstream_ring_id); + .def_readwrite("ring_id_to_ranks", &DistModelConfig::ring_id_to_ranks_) + .def_readwrite("rank_to_ring_ids", &DistModelConfig::rank_to_ring_ids_) + .def_readwrite("enable_timer", &DistModelConfig::enable_timer); py::class_(*m, "DistModel") .def(py::init()) From 24b8f63efb61f0ddb0d40fea8d1f474d600b9ce1 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 16 Feb 2022 17:41:03 +0800 Subject: [PATCH 04/19] [MLU] fix TensorAdd for mlu (#39523) --- .../fluid/imperative/gradient_accumulator.cc | 32 +++++++++++++++++++ python/paddle/nn/functional/loss.py | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index dc8b3982ba998..17ab1f1f7c53f 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -35,6 +35,9 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif namespace paddle { namespace imperative { @@ -362,6 +365,35 @@ void TensorAdd(const VarType& src, VarType* dst) { } #endif +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place)) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::DeviceContext* ctx = pool.Get(place); + auto dev_ctx = dynamic_cast(ctx); + if (data_type == framework::DataTypeTrait::DataType()) { + dst_tensor->mutable_data(place); + } else if (data_type == + framework::DataTypeTrait::DataType()) { + dst_tensor->mutable_data(place); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } + static const float alpha = 1.f; + static const float beta = 1.f; + operators::MLUCnnlTensorDesc src_tensor_desc(src_tensor); + operators::MLUCnnlTensorDesc dst_tensor_desc(*dst_tensor); + PADDLE_ENFORCE_MLU_SUCCESS(cnnlAssignAdd( + dev_ctx->cnnl_handle(), static_cast(&alpha), + src_tensor_desc.get(), operators::GetBasePtr(&src_tensor), nullptr, 0, + 
static_cast(&beta), dst_tensor_desc.get(), + operators::GetBasePtr(dst_tensor))); + return; + } +#endif + PADDLE_TENSOR_ADD(float); #ifndef PADDLE_WITH_XPU diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 711fd1e94cae9..8dc040325934f 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1676,7 +1676,7 @@ def cross_entropy(input, if label_max >= input.shape[axis]: raise ValueError("label should not out of bound, but got{}". format(label_max)) - if core.is_compiled_with_npu(): + if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, From a909bdf154fa0d44124b73dcdb1c8c4205c83999 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Wed, 16 Feb 2022 18:55:42 +0800 Subject: [PATCH 05/19] [Eager] Support eager hook_for_layer (#39531) * Update comment * [Eager] Support test_imperative_hook_for_layer with _test_eager_guard() * Polish code name style * Fix a error name * Polish code, make it clear and simple --- python/paddle/fluid/dygraph/layers.py | 2 +- .../test_imperative_hook_for_layer.py | 64 +++++++++++-------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 6a65b3bd9c684..53dbf1a66b27f 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -342,7 +342,7 @@ def register_forward_pre_hook(self, hook): import paddle import numpy as np - # the forward_post_hook change the input of the layer: input = input * 2 + # the forward_pre_hook change the input of the layer: input = input * 2 def forward_pre_hook(layer, input): # user can use layer and input for information statistis tasks diff --git a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py index 317353684317f..4c457e9345c5d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_hook_for_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
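Both hook entry points exercised by this test are public Layer APIs, as the layers.py docstring above indicates. A minimal, self-contained usage sketch; the doubling behavior is illustrative only:

import paddle

linear = paddle.nn.Linear(4, 4)

def pre_hook(layer, inputs):
    # runs before forward; returning a tuple rewrites the forward inputs
    return (inputs[0] * 2, )

def post_hook(layer, inputs, output):
    # runs after forward; the return value replaces the forward output
    return output * 2

pre_handle = linear.register_forward_pre_hook(pre_hook)
post_handle = linear.register_forward_post_hook(post_hook)
out = linear(paddle.ones([2, 4]))  # forward runs with both hooks applied
pre_handle.remove()   # hooks detach through the returned handles
post_handle.remove()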
@@ -25,14 +25,15 @@ import paddle.fluid.dygraph.base as base from test_imperative_lod_tensor_to_selected_rows import SimpleNet +from paddle.fluid.framework import _test_eager_guard -call_forward_hook = False +call_forward_post_hook = False call_forward_pre_hook = False -def forward_hook(layer, input, output): - global call_forward_hook - call_forward_hook = True +def forward_post_hook(layer, input, output): + global call_forward_post_hook + call_forward_post_hook = True def forward_pre_hook(layer, input): @@ -40,7 +41,7 @@ def forward_pre_hook(layer, input): call_forward_pre_hook = True -def forward_hook1(layer, input, output): +def forward_post_hook1(layer, input, output): return output * 2 @@ -50,8 +51,8 @@ def forward_pre_hook1(layer, input): class Test_Forward_Hook(unittest.TestCase): - # test forward_pre_hook and forward_hook that have return value - def test_forward_hook_return_value(self): + # test forward_pre_hook and forward_post_hook that have return value + def func_forward_hook_return_value(self): seed = 90 places = [fluid.CPUPlace()] @@ -104,23 +105,23 @@ def test_forward_hook_return_value(self): self.assertTrue( np.array_equal(outs_pre_hook.numpy(), outs_origin.numpy())) - # register forward_hook - forward_hook_handle1 = simplenet.register_forward_post_hook( - forward_hook1) + # register forward_posst_hook + forward_post_hook_handle1 = simplenet.register_forward_post_hook( + forward_post_hook1) outs_forward_hook = simplenet(input, y) self.assertTrue( np.array_equal(outs_forward_hook.numpy(), outs_origin.numpy() * 2)) - # remove forward_hook - forward_hook_handle1.remove() + # remove forward_post_hook + forward_post_hook_handle1.remove() outs_forward_hook = simplenet(input, y) self.assertTrue( np.array_equal(outs_forward_hook.numpy(), outs_origin.numpy())) - # test forward_pre_hook and forward_hook that don't have return value - def test_forward_hook(self): + # test forward_pre_hook and forward_post_hook that don't have return value + def func_forward_hook(self): seed = 90 places = [fluid.CPUPlace()] @@ -133,7 +134,7 @@ def test_forward_hook(self): fluid.default_main_program().random_seed = seed fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - global call_forward_hook + global call_forward_post_hook global call_forward_pre_hook input_word = np.array( @@ -158,38 +159,45 @@ def test_forward_hook(self): # origin, don't register any hook outs_origin = simplenet(input, y) - self.assertFalse(call_forward_hook) + self.assertFalse(call_forward_post_hook) self.assertFalse(call_forward_pre_hook) - # register forward_hook and forward_pre_hook - forward_hook_handle = simplenet.register_forward_post_hook( - forward_hook) + # register forward_post_hook and forward_pre_hook + forward_post_hook_handle = simplenet.register_forward_post_hook( + forward_post_hook) forward_pre_hook_handle = simplenet.register_forward_pre_hook( forward_pre_hook) outs_hook = simplenet(input, y) - self.assertTrue(call_forward_hook) + self.assertTrue(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) outs_hook = simplenet(input, y) - self.assertTrue(call_forward_hook) + self.assertTrue(call_forward_post_hook) self.assertTrue(call_forward_pre_hook) - # remove forward_hook - forward_hook_handle.remove() - call_forward_hook = False + # remove forward_post_hook + forward_post_hook_handle.remove() + call_forward_post_hook = False call_forward_pre_hook = False outs_remove_forward_hook = simplenet(input, y) - self.assertFalse(call_forward_hook) + self.assertFalse(call_forward_post_hook) 
self.assertTrue(call_forward_pre_hook) # remove forward_pre_hook forward_pre_hook_handle.remove() - call_forward_hook = False + call_forward_post_hook = False call_forward_pre_hook = False outs_remove_hook = simplenet(input, y) - self.assertFalse(call_forward_hook) + self.assertFalse(call_forward_post_hook) self.assertFalse(call_forward_pre_hook) + def test_forward_hook_return_value(self): + with _test_eager_guard(): + self.func_forward_hook() + self.func_forward_hook_return_value() + self.func_forward_hook() + self.func_forward_hook_return_value() + if __name__ == '__main__': unittest.main() From 1354652be009d09ab13837bda5ff538f1d0991ff Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 17 Feb 2022 09:43:42 +0800 Subject: [PATCH 06/19] Modified distribution kernel with Kernel Primitive API (#39563) --- paddle/fluid/operators/distribution_helper.h | 35 +++++++----- .../kernels/primitive/compute_primitives.h | 53 +++++++++++++++++++ 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h index a13ae57090687..f3bce38e3a7d8 100644 --- a/paddle/fluid/operators/distribution_helper.h +++ b/paddle/fluid/operators/distribution_helper.h @@ -28,6 +28,10 @@ limitations under the License. */ #include "paddle/fluid/platform/for_range.h" #include "paddle/pten/core/hostdevice.h" +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/pten/kernels/primitive/kernel_primitives.h" +#endif + #if !defined(_WIN32) #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) #else @@ -91,6 +95,8 @@ struct normal_transform { #if defined(__NVCC__) || defined(__HIPCC__) +namespace kps = pten::kps; + /*********************** Distribution Function *************************/ template struct uniform_distribution; @@ -176,25 +182,26 @@ template __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, DistOp dist, TransformOp trans, T *out_data) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - int32_t returns_count = DistOp::kReturnsCount; + size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); + static constexpr int kCount = DistOp::kReturnsCount; #if defined(__NVCC__) curandStatePhilox4_32_10_t state; - curand_init(seed, idx, offset, &state); + curand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = curandStatePhilox4_32_10_t; #else hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, idx, offset, &state); + hiprand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = hiprandStatePhilox4_32_10_t; #endif - size_t total_thread = gridDim.x * blockDim.x; - for (size_t i = idx; i < size; i += total_thread * returns_count) { - auto random_tuple = dist(&state); - for (size_t j = 0; j < returns_count; j++) { - size_t index = i + j * total_thread; - if (index < size) { - auto random = (&random_tuple.x)[j]; - out_data[index] = static_cast(trans(random)); - } - } + size_t total_thread = GRID_NUM_X * BLOCK_NUM_X; + T args[kCount]; + T result[kCount]; + for (size_t i = idx; i < size; i += total_thread * kCount) { + kps::ElementwiseRandom(&args[0], dist, &state); + kps::ElementwiseUnary(&result[0], &args[0], + trans); + kps::WriteData(out_data + i, &result[0], size - i, + 1, total_thread, 1); } } diff --git a/paddle/pten/kernels/primitive/compute_primitives.h b/paddle/pten/kernels/primitive/compute_primitives.h index a8ed081622763..02a2f7baf780b 100644 --- 
a/paddle/pten/kernels/primitive/compute_primitives.h +++ b/paddle/pten/kernels/primitive/compute_primitives.h @@ -428,5 +428,58 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) { } } +template +__device__ __forceinline__ void ElementwiseRandom(OutT* out, + OpFunc compute, + StateType* state) { + auto random_tuple = compute(state); +#pragma unroll + for (int i = 0; i < ReturnsCount; i++) { + out[i] = static_cast((&random_tuple.x)[i]); + } +} + +// attention please set share_size = blockDim.x; +// data and b are the register pointer +#define shared_size 64 +template +__device__ __forceinline__ void Cumsum(OutT* out, + const InT* in, + OpFunc compute) { + __shared__ InT temp[shared_size * 2 + (shared_size * 2) / 32]; + int tidx = threadIdx.x; + temp[tidx + tidx / 32] = in[0]; + temp[shared_size + tidx + (shared_size + tidx) / 32] = in[1]; + for (int stride = 1; stride <= blockDim.x; stride *= 2) { + __syncthreads(); + int index = (tidx + 1) * 2 * stride - 1; + if (index < (blockDim.x * 2)) { + temp[index + index / 32] += temp[index - stride + (index - stride) / 32]; + } + } + for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) { + __syncthreads(); + int index = (tidx + 1) * 2 * stride - 1; + if ((index + stride) < (blockDim.x * 2)) { + temp[index + stride + (stride + index) / 32] += + temp[index + (index) / 32]; + } + } + + __syncthreads(); + out[0] = static_cast(temp[tidx + tidx / 32]); + out[1] = + static_cast(temp[tidx + shared_size + (tidx + shared_size) / 32]); +} + } // namespace kps } // namespace pten From b4d3597a346eb18afa177a963144f10cf5348b08 Mon Sep 17 00:00:00 2001 From: baoachun <962571062@qq.com> Date: Thu, 17 Feb 2022 10:47:02 +0800 Subject: [PATCH 07/19] update inference ut to support nhwc format (#39551) * update inference ut to support nhwc format * update ut and pass OpCompat * update ut * update ut --- .../conv_activation_mkldnn_fuse_pass.cc | 2 +- .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 4 ++-- .../conv_concat_relu_mkldnn_fuse_pass.cc | 2 +- .../test_conv_act_mkldnn_fuse_pass.py | 8 +++---- .../test_conv_bias_mkldnn_fuse_pass.py | 14 +++++------- ...kldnn_conv_concat_relu_mkldnn_fuse_pass.py | 9 -------- .../test_mkldnn_conv_gelu_fuse_pass.py | 22 ------------------- ...test_mkldnn_conv_hard_sigmoid_fuse_pass.py | 9 -------- .../test_mkldnn_conv_hard_swish_fuse_pass.py | 9 -------- ...st_mkldnn_conv_transpose_bias_fuse_pass.py | 12 +++++----- 10 files changed, 20 insertions(+), 71 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 6370d3380361c..453cfb85554ec 100755 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -157,7 +157,7 @@ ConvActivationFusePass::ConvActivationFusePass() { // IsStringIn({"NHWC", "NCHW"}) MobileNetV2 has no this attribute .AddAttr("data_format") .IsOptional() - .IsStringIn({"NCHW", "AnyLayout"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("relu")) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 9f70c829e1fb5..5d325037ad20e 100755 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -115,7 +115,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() { 
.IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") - .IsStringIn({"NCHW"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -129,7 +129,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() { .IsTensor() .End() .AddAttr("axis") - .IsIntIn({1}) + .IsIntIn({1, 3}) .End(); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc index 0947f4756ad78..5fbfef08b7209 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc @@ -59,7 +59,7 @@ ConvConcatReLUFusePass::ConvConcatReLUFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("concat")) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py index d9b3b8e601755..d029bcd6a7f17 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_act_mkldnn_fuse_pass.py @@ -53,8 +53,6 @@ def is_program_valid(self, prog_config): data_format = prog_config.ops[0].attrs["data_format"] filter_shape = prog_config.weights["filter"].shape input_shape = prog_config.inputs["input_x"].shape - if data_format != "NCHW": - return False if padding_algorithm == "VALID": if ((input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ ((input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: @@ -80,8 +78,8 @@ def sample_program_config(self, draw): x_shape = draw( st.lists( st.integers( - min_value=1, max_value=100), min_size=4, max_size=4)) - x_shape[1] = draw(st.integers(min_value=1, max_value=10)) + min_value=5, max_value=100), min_size=4, max_size=4)) + x_shape[1] = draw(st.integers(min_value=5, max_value=10)) # 2. 
Generate legal attr:data_format of conv2d data_format = draw(st.sampled_from(["NCHW", "NHWC"])) @@ -90,7 +88,7 @@ def sample_program_config(self, draw): f_shape = draw( st.lists( st.integers( - min_value=1, max_value=7), min_size=4, max_size=4)) + min_value=1, max_value=5), min_size=4, max_size=4)) if data_format == "NCHW": f_shape[1] = x_shape[1] else: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py index 40fd9a418b9b1..a0213c5b1f4df 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py @@ -53,8 +53,6 @@ def is_program_valid(self, prog_config): data_format = prog_config.ops[0].attrs["data_format"] filter_shape = prog_config.weights["filter"].shape input_shape = prog_config.inputs["input_x"].shape - if data_format != "NCHW": - return False if padding_algorithm == "VALID": if ((input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ ((input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: @@ -80,8 +78,8 @@ def sample_program_config(self, draw): x_shape = draw( st.lists( st.integers( - min_value=1, max_value=100), min_size=4, max_size=4)) - x_shape[1] = draw(st.integers(min_value=1, max_value=10)) + min_value=5, max_value=100), min_size=4, max_size=4)) + x_shape[1] = draw(st.integers(min_value=5, max_value=10)) # 2. Generate legal attr:data_format of conv2d data_format = draw(st.sampled_from(["NCHW", "NHWC"])) @@ -90,7 +88,7 @@ def sample_program_config(self, draw): f_shape = draw( st.lists( st.integers( - min_value=1, max_value=7), min_size=4, max_size=4)) + min_value=1, max_value=4), min_size=4, max_size=4)) if data_format == "NCHW": f_shape[1] = x_shape[1] else: @@ -100,7 +98,7 @@ def sample_program_config(self, draw): strides = draw( st.lists( st.integers( - min_value=1, max_value=5), min_size=2, max_size=2)) + min_value=1, max_value=4), min_size=2, max_size=2)) # 5. Generate legal attr:padding_algorithm of conv2d padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) @@ -109,7 +107,7 @@ def sample_program_config(self, draw): padding = draw( st.lists( st.integers( - min_value=1, max_value=5), min_size=4, max_size=4)) + min_value=1, max_value=4), min_size=4, max_size=4)) # 7. Generate legal attr:groups of conv2d groups = draw(st.integers(min_value=1, max_value=3)) @@ -118,7 +116,7 @@ def sample_program_config(self, draw): dilations = draw( st.lists( st.integers( - min_value=1, max_value=5), min_size=2, max_size=2)) + min_value=1, max_value=4), min_size=2, max_size=2)) # 9. 
Generate legal shape of input:bias of elementwise_add bias_shape = [f_shape[0]] diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py index 3c823c73d3794..6654fbba264e0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_concat_relu_mkldnn_fuse_pass.py @@ -27,15 +27,6 @@ class TestConvConcatReluMkldnnFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[0]['data_format'] == "NHWC": - return False - return True def sample_program_config(self, draw): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py index aa779f6ecbc4f..33df428388882 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_gelu_fuse_pass.py @@ -27,15 +27,6 @@ class TestConvGeluMkldnnFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[0]['data_format'] == "NHWC": - return False - return True def sample_program_config(self, draw): @@ -108,19 +99,6 @@ def sample_predictor_configs(self, program_config): config = self.create_inference_config(use_mkldnn=True) yield config, ["conv2d"], (1e-5, 1e-5) - # If the problem has been fixed, the judgment - # needs to be deleted!!! - def add_ignore_pass_case(self): - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - self.add_ignore_check_case( - teller1, SkipReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC" - ) - def test(self): self.run_and_statis(quant=False, passes=["conv_gelu_mkldnn_fuse_pass"]) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py index a0c4e183930a5..2eb071d6eb83b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_sigmoid_fuse_pass.py @@ -27,15 +27,6 @@ class TestConvHardSigmoidMkldnnFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! 
- if attrs[0]['data_format'] == "NHWC": - return False - return True def sample_program_config(self, draw): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py index 17bfb625fd37b..990489c32136a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_hard_swish_fuse_pass.py @@ -27,15 +27,6 @@ class TestConvHardSwishMkldnnFusePass(PassAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[0]['data_format'] == "NHWC": - return False - return True def sample_program_config(self, draw): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py index 5df7cb8d8cec3..c5cedac226149 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_transpose_bias_fuse_pass.py @@ -32,9 +32,9 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: for i in range(len(program_config.ops)) ] - # If the problem has been fixed, the judgment - # needs to be deleted!!! - if attrs[0]['data_format'] == "NHWC": + if attrs[0]['data_format'] == "NCHW" and attrs[1]["axis"] == 3: + return False + if attrs[0]['data_format'] == "NHWC" and attrs[1]["axis"] == 1: return False return True @@ -46,7 +46,7 @@ def sample_program_config(self, draw): groups = draw(st.sampled_from([1, 2, 4, 8])) paddings = draw(st.sampled_from([[0, 3], [1, 2, 3, 4]])) strides = draw(st.sampled_from([[1, 1], [2, 2], [1, 2]])) - axis = draw(st.sampled_from([1])) + axis = draw(st.sampled_from([1, 3])) batch_size = draw(st.integers(min_value=1, max_value=4)) def generate_input(): @@ -110,7 +110,9 @@ def sample_predictor_configs(self, program_config): def test(self): self.run_and_statis( - quant=False, passes=["conv_transpose_bias_mkldnn_fuse_pass"]) + quant=False, + max_duration=300, + passes=["conv_transpose_bias_mkldnn_fuse_pass"]) if __name__ == "__main__": From d63ece1f764e2e419038ce35c607e821dd87be9a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 17 Feb 2022 13:41:25 +0800 Subject: [PATCH 08/19] [PTen] Remove fluid device context deps (#39604) * remove fluid device context deps * fix compile failde --- paddle/pten/core/kernel_utils.h | 4 +++- paddle/pten/kernels/gpu/copy_kernel.cu | 1 + paddle/pten/tests/api/test_matmul_api.cc | 2 ++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index 7c611d7eccd11..8bc125c50bed6 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -14,7 +14,9 @@ #pragma once -#include "paddle/pten/backends/all_context.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/backends/xpu/xpu_context.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/common/scalar_array.h" #include "paddle/pten/core/dense_tensor.h" diff --git a/paddle/pten/kernels/gpu/copy_kernel.cu 
b/paddle/pten/kernels/gpu/copy_kernel.cu index 5d6229f09f015..21426698d43b7 100644 --- a/paddle/pten/kernels/gpu/copy_kernel.cu +++ b/paddle/pten/kernels/gpu/copy_kernel.cu @@ -21,6 +21,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" namespace pten { diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc index 4d3adf86d166d..903ced28c1bbe 100644 --- a/paddle/pten/tests/api/test_matmul_api.cc +++ b/paddle/pten/tests/api/test_matmul_api.cc @@ -23,6 +23,8 @@ limitations under the License. */ #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/copy_kernel.h" +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace tests { From 5fb9cf6065fcd9d8a534569bc709dbb3e348e71b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 17 Feb 2022 13:56:42 +0800 Subject: [PATCH 09/19] support set fp32 input for fp16 kernel (#39625) --- paddle/pten/core/kernel_registry.h | 18 ++++----- paddle/pten/core/type_defs.h | 2 +- paddle/pten/tests/core/test_kernel_factory.cc | 39 +++++++++++++++++++ 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index 927c36e9e8f43..df95f50c98106 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -184,7 +184,7 @@ struct KernelRegistrar { KernelKey kernel_key(backend, layout, dtype); Kernel kernel(kernel_fn, variadic_kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); - args_def_fn(&kernel); + args_def_fn(kernel_key, &kernel); KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; } }; @@ -231,7 +231,7 @@ struct KernelRegistrar { kernel_name, backend, layout, meta_kernel_fn, ...) \ PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel); \ PT_KERNEL_REGISTRAR_INIT( \ kernel_name, \ backend, \ @@ -240,7 +240,7 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__); \ void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel) #else /** * `template decltype(fn) fn` can work on gcc and clang, @@ -257,7 +257,7 @@ struct KernelRegistrar { #define _PT_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel); \ PT_EXPAND(PT_KERNEL_REGISTRAR_INIT( \ kernel_name, \ backend, \ @@ -266,7 +266,7 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)); \ void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel) #endif #define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, ...) 
\ @@ -786,7 +786,7 @@ struct KernelRegistrar { kernel_name, backend, layout, kernel_fn, dtype) \ template decltype(kernel_fn) kernel_fn; \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel); \ static const ::pten::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ #kernel_name, \ @@ -800,12 +800,12 @@ struct KernelRegistrar { return 0; \ } \ void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel) #else #define _PT_REGISTER_GENERAL_KERNEL( \ kernel_name, backend, layout, kernel_fn, dtype) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel*); \ + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel); \ static const ::pten::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ #kernel_name, \ @@ -819,7 +819,7 @@ struct KernelRegistrar { return 0; \ } \ void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ - ::pten::Kernel* kernel) + const ::pten::KernelKey& kernel_key, ::pten::Kernel* kernel) #endif /** PT_DECLARE_KERNEL diff --git a/paddle/pten/core/type_defs.h b/paddle/pten/core/type_defs.h index 9b91720d86f1e..4ecc12fcdef01 100644 --- a/paddle/pten/core/type_defs.h +++ b/paddle/pten/core/type_defs.h @@ -27,7 +27,7 @@ class ArgumentMappingContext; class InferMetaContext; using KernelFn = std::function; -using KernelArgsDefFn = void (*)(Kernel* kernel); +using KernelArgsDefFn = void (*)(const KernelKey& kernel_key, Kernel* kernel); using KernelArgsParseFn = void (*)(const KernelKey& default_key, KernelArgsDef* args_def); diff --git a/paddle/pten/tests/core/test_kernel_factory.cc b/paddle/pten/tests/core/test_kernel_factory.cc index 5355921ddbe01..c9e8dffe56ff9 100644 --- a/paddle/pten/tests/core/test_kernel_factory.cc +++ b/paddle/pten/tests/core/test_kernel_factory.cc @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include #include +#include "paddle/pten/common/float16.h" +#include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_factory.h" #include "paddle/pten/core/kernel_registry.h" @@ -47,5 +49,42 @@ TEST(KernelFactory, SelectedKernelMap) { } } +template +void TestKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& param, + DenseTensor* out) {} + +TEST(KernelRegistry, SetFP32Input) { + pten::KernelKey kernel_key(pten::Backend::CPU, + pten::DataLayout::ALL_LAYOUT, + pten::DataType::FLOAT16); + auto test_kernel = + pten::KernelFactory::Instance().SelectKernel("test", kernel_key); + EXPECT_TRUE(test_kernel.IsValid()); + auto& arg_defs = test_kernel.args_def(); + auto& input_defs = arg_defs.input_defs(); + auto& attr_defs = arg_defs.attribute_defs(); + auto& output_defs = arg_defs.output_defs(); + EXPECT_EQ(input_defs.size(), 2UL); + EXPECT_EQ(attr_defs.size(), 0UL); + EXPECT_EQ(output_defs.size(), 1UL); + EXPECT_EQ(input_defs.at(0).dtype, pten::DataType::FLOAT16); + EXPECT_EQ(input_defs.at(1).dtype, pten::DataType::FLOAT32); + EXPECT_EQ(output_defs.at(0).dtype, pten::DataType::FLOAT16); +} + } // namespace tests } // namespace pten + +PT_REGISTER_KERNEL(test, + CPU, + ALL_LAYOUT, + pten::tests::TestKernel, + float, + double, + pten::dtype::float16) { + if (kernel_key.dtype() == pten::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(pten::DataType::FLOAT32); + } +} From 1c9b2483be14a8cfcf673168e3c70b9e87d20067 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 17 Feb 2022 14:30:53 +0800 Subject: [PATCH 10/19] move trace infer shape (#39517) --- paddle/fluid/operators/trace_op.cc | 59 ++++-------------------------- paddle/pten/infermeta/unary.cc | 53 ++++++++++++++++++++++++++- paddle/pten/infermeta/unary.h | 4 ++ 3 files changed, 63 insertions(+), 53 deletions(-) diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index aabad64c894df..6145db5f5ef63 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -12,8 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/pten/core/infermeta_utils.h" +#include "paddle/pten/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,57 +24,6 @@ namespace operators { class TraceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Input"), true, - platform::errors::NotFound("Input of TraceOp is not found.")); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output of TraceOp is not found.")); - - int dim1 = ctx->Attrs().Get("axis1"); - int dim2 = ctx->Attrs().Get("axis2"); - - auto x_dims = ctx->GetInputDim("Input"); - - int dim1_ = dim1 < 0 ? x_dims.size() + dim1 : dim1; - int dim2_ = dim2 < 0 ? 
x_dims.size() + dim2 : dim2; - - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::OutOfRange( - "Input's dim is out of range (expected at least 2, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - dim1_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(dim1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), dim1)); - PADDLE_ENFORCE_LT( - dim2_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(dim2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), dim2)); - PADDLE_ENFORCE_NE(dim1_, dim2_, - platform::errors::InvalidArgument( - "The dimensions should not be identical " - "%ld vs %ld.", - dim1, dim2)); - - auto sizes = vectorize(x_dims); - if (x_dims.size() == 2) { - sizes.clear(); - sizes.push_back(1); - } else { - sizes.erase(sizes.begin() + std::max(dim1_, dim2_)); - sizes.erase(sizes.begin() + std::min(dim1_, dim2_)); - } - ctx->SetOutputDim("Out", framework::make_ddim(sizes)); - } }; class TraceOpMaker : public framework::OpProtoAndCheckerMaker { @@ -155,9 +107,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, + PT_INFER_META(pten::TraceInferMeta)); REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, - ops::TraceGradOpMaker); + ops::TraceGradOpMaker, + TraceInferShapeFunctor); REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad, ops::TraceGradNoNeedBufferVarsInferer); diff --git a/paddle/pten/infermeta/unary.cc b/paddle/pten/infermeta/unary.cc index ca59937399a22..ec9ba519b95ba 100644 --- a/paddle/pten/infermeta/unary.cc +++ b/paddle/pten/infermeta/unary.cc @@ -444,8 +444,59 @@ void SplitInferMeta(const MetaTensor& x, (*out)[i].share_lod(x); } } +} + +void TraceInferMeta( + const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out) { + int dim1 = axis1; + int dim2 = axis2; + + auto x_dims = x.dims(); - return; + int dim1_ = dim1 < 0 ? x_dims.size() + dim1 : dim1; + int dim2_ = dim2 < 0 ? 
x_dims.size() + dim2 : dim2; + + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + pten::errors::OutOfRange( + "Input's dim is out of range (expected at least 2, but got %ld).", + x_dims.size())); + PADDLE_ENFORCE_LT( + dim1_, + x_dims.size(), + pten::errors::OutOfRange( + "Attr(dim1) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + dim1)); + PADDLE_ENFORCE_LT( + dim2_, + x_dims.size(), + pten::errors::OutOfRange( + "Attr(dim2) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), + (x_dims.size() - 1), + dim2)); + PADDLE_ENFORCE_NE( + dim1_, + dim2_, + pten::errors::InvalidArgument("The dimensions should not be identical " + "%ld vs %ld.", + dim1, + dim2)); + + auto sizes = vectorize(x_dims); + if (x_dims.size() == 2) { + sizes.clear(); + sizes.push_back(1); + } else { + sizes.erase(sizes.begin() + std::max(dim1_, dim2_)); + sizes.erase(sizes.begin() + std::min(dim1_, dim2_)); + } + out->set_dims(framework::make_ddim(sizes)); } } // namespace pten diff --git a/paddle/pten/infermeta/unary.h b/paddle/pten/infermeta/unary.h index 4c816c4adbc23..5bdf1d491c634 100644 --- a/paddle/pten/infermeta/unary.h +++ b/paddle/pten/infermeta/unary.h @@ -80,4 +80,8 @@ void SplitInferMeta(const MetaTensor& x_meta, const Scalar& axis, std::vector* out, MetaConfig config = MetaConfig()); + +void TraceInferMeta( + const MetaTensor& x, int offset, int axis1, int axis2, MetaTensor* out); + } // namespace pten From 1035d21f5002361c82190dc36aca210d41e1d69e Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Thu, 17 Feb 2022 01:03:24 -0600 Subject: [PATCH 11/19] refine data loader api in infrt (#39580) * update generate_pd_op_dialect_from_paddle_op_maker.py * update mlir tensor load interface * refine * fix bug * fix * refine * fix * 3 * fix * codestyle Co-authored-by: weishengying <1343838695@qq.com> --- paddle/infrt/dialect/dense_tensor.td | 3 +- paddle/infrt/host_context/paddle_mlir.cc | 13 +- paddle/infrt/kernel/tensor_kernels.cc | 9 +- .../tests/dialect/tensor/tensor_map.mlir.in | 3 +- paddle/scripts/infrt_build.sh | 2 +- ...rate_pd_op_dialect_from_paddle_op_maker.py | 147 ++++++++++++------ 6 files changed, 115 insertions(+), 62 deletions(-) diff --git a/paddle/infrt/dialect/dense_tensor.td b/paddle/infrt/dialect/dense_tensor.td index 7156e22951225..75c8a0d88e4c1 100644 --- a/paddle/infrt/dialect/dense_tensor.td +++ b/paddle/infrt/dialect/dense_tensor.td @@ -112,6 +112,7 @@ def LoadParamsOp : DT_Op<"load_params", [NoSideEffect]> { let verifier = ?; } + def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { let summary = "dt.tensor_map_get_tensor operation"; @@ -122,7 +123,7 @@ def TensorMapGetTensorOp : DT_Op<"tensor_map_get_tensor", [NoSideEffect]> { // input path of model params. 
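// The tensor name is now carried as a compile-time string attribute
// (StrAttr) instead of a runtime StringType operand; the kernel receives it
// as host_context::Attribute<std::string> rather than as an extra input.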
let arguments = (ins TensorMapType:$map, - StringType:$name + StrAttr:$name ); let results = (outs TensorType:$output); let assemblyFormat = "`(` operands `)` attr-dict `->` type($output)"; diff --git a/paddle/infrt/host_context/paddle_mlir.cc b/paddle/infrt/host_context/paddle_mlir.cc index 475e1e8816820..1c36b04f366bf 100644 --- a/paddle/infrt/host_context/paddle_mlir.cc +++ b/paddle/infrt/host_context/paddle_mlir.cc @@ -171,7 +171,7 @@ void MLIRModelGenImpl::UpdateModelParams( builder_, &precision_); mlir::Type type_ = mlir::RankedTensorType::get(dims, precision_); - auto op = builder_.create( + auto op = builder_.create( mlir::UnknownLoc::get(context_), type_, map, name); params_map_.insert(std::pair( var_desc.name(), op.getOperation()->getResult(0))); @@ -224,15 +224,14 @@ llvm::SmallVector MLIRModelGenImpl::GetOpInputValue( const infrt::paddle::framework_proto::OpDesc &op_) { llvm::SmallVector operands; - std::vector inputs_info = {}; + std::unordered_map inputs_info = {}; if (pd_dialect_inputs_info_map_.count(op_.type())) inputs_info = pd_dialect_inputs_info_map_.at(op_.type()); for (int var_idx = 0; var_idx < op_.inputs_size(); ++var_idx) { auto &var = op_.inputs(var_idx); if (!var.arguments().empty()) { - if (!std::count(inputs_info.begin(), inputs_info.end(), var.parameter())) - continue; + if (!inputs_info.count(var.parameter())) continue; operands.push_back((params_map_[var.arguments()[0]])); } } @@ -243,7 +242,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpOutputType( const infrt::paddle::framework_proto::OpDesc &op_) { llvm::SmallVector resultTypes; - std::vector pd_dialect_outputs_info = {}; + std::unordered_map pd_dialect_outputs_info = {}; if (pd_dialect_outputs_info_map_.count(op_.type())) pd_dialect_outputs_info = pd_dialect_outputs_info_map_.at(op_.type()); @@ -251,9 +250,7 @@ llvm::SmallVector MLIRModelGenImpl::GetOpOutputType( for (int var_idx = 0; var_idx < op_.outputs_size(); ++var_idx) { auto &var_name = op_.outputs(var_idx).arguments()[0]; - if (!std::count(pd_dialect_outputs_info.begin(), - pd_dialect_outputs_info.end(), - op_.outputs(var_idx).parameter())) + if (!pd_dialect_outputs_info.count(op_.outputs(var_idx).parameter())) continue; // update persistable tensors diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index c6e28c4c79d29..1e55bcd07ae80 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -54,10 +54,11 @@ TensorMap LoadParams(const std::string &path) { } void TensorMapGetTensor(TensorMap map, - const std::string &name, - DenseHostTensor *out) { - auto it = map.find(name); - CHECK(it != map.end()) << "No tensor called " << name << " in the TensorMap"; + DenseHostTensor *out, + Attribute name) { + auto it = map.find(name.get()); + CHECK(it != map.end()) << "No tensor called " << name.get() + << " in the TensorMap"; *out = *it->second; } diff --git a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in index 3baa6bcd42050..4edb918b5a28f 100644 --- a/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in +++ b/paddle/infrt/tests/dialect/tensor/tensor_map.mlir.in @@ -6,8 +6,7 @@ func @load_tensor_map() { %size = dt.tensor_map_get_size(%map) -> i32 infrt.print.i32 %size - %tensor_name = infrt.get_string("fc_bias") - %a = dt.tensor_map_get_tensor(%map, %tensor_name) -> !infrt.tensor + %a = dt.tensor_map_get_tensor(%map) {name="fc_bias"} -> !infrt.tensor // CHECK: tensor: shape=shape[2], values=[0, 0] dt.print_tensor (%a 
: !infrt.tensor) diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index f76fa497d6a03..ff86e7f52d535 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -90,7 +90,7 @@ function infrt_gen_and_build() { exit 7; fi - make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec infrt_lib_dist;build_error=$? + make -j ${parallel_number} infrt infrtopt infrtexec test_infrt_exec trt-exec infrt_lib_dist paddle-mlir-convert;build_error=$? if [ "$build_error" != 0 ];then exit 7; fi diff --git a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py index f77ef86cc6c43..027dfe4328a55 100644 --- a/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py +++ b/tools/infrt/generate_pd_op_dialect_from_paddle_op_maker.py @@ -110,10 +110,92 @@ def get_all_ops_desc(): return all_op_protos_dict +def generate_all_ops_inputs_outputs_map(op_descs): + # 1. Collect input and output name information of each Op + original_ops_ = get_original_ops() + ops_inputs_map = {} + ops_outputs_map = {} + for op_type, op_proto in op_descs.items(): + if op_type not in original_ops_: + continue + inputs = list() + outpus = list() + for input_ in op_proto[INPUTS]: + if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][ + input_][INTERMEDIATE] != True: + inputs.append(input_) + for output_ in op_proto[OUTPUTS]: + if op_proto[OUTPUTS][output_][EXTRA] != True and op_proto[OUTPUTS][ + output_][INTERMEDIATE] != True: + outpus.append(output_) + ops_inputs_map[op_type] = inputs + ops_outputs_map[op_type] = outpus + + # 2. Generate Cpp style map str + cpp_style_ops_inputs_map_str = "" + start_ = "#include \n#include \n#include \n" + \ + "const std::unordered_map> pd_dialect_inputs_info_map_ = {\n" + ops_inputs_str = "" + for ele in ops_inputs_map.items(): + op_name = ele[0] + op_inputs = ele[1] + op_inputs_str = "{" + input_idx = 0 + for op_input in op_inputs: + op_input_str = '{left_brace}"{op_input}", {input_idx}{right_brace}, '.format( + left_brace="{", + op_input=op_input, + input_idx=input_idx, + right_brace="}") + input_idx = input_idx + 1 + op_inputs_str = op_inputs_str + op_input_str + op_inputs_str = op_inputs_str[:-2] + "}" + pair = '{left_brace}"{op_name}", {op_inputs}{right_brace},\n'.format( + left_brace="{", + op_name=op_name, + op_inputs=op_inputs_str, + right_brace="}") + ops_inputs_str = ops_inputs_str + " " + pair + ops_inputs_str = ops_inputs_str[:-2] + cpp_style_ops_inputs_map_str = start_ + ops_inputs_str + "\n};" + + cpp_style_ops_outputs_map_str = "" + start_ = "const std::unordered_map> pd_dialect_outputs_info_map_ = {\n" + ops_outputs_str = "" + for ele in ops_outputs_map.items(): + op_name = ele[0] + op_outputs = ele[1] + op_outputs_str = "{" + output_idx = 0 + for op_output in op_outputs: + op_output_str = '{left_brace}"{op_output}", {output_idx}{right_brace}, '.format( + left_brace="{", + op_output=op_output, + output_idx=output_idx, + right_brace="}") + output_idx = output_idx + 1 + op_outputs_str = op_outputs_str + op_output_str + op_outputs_str = op_outputs_str[:-2] + "}" + pair = '{left_brace}"{op_name}", {op_outputs}{right_brace},\n'.format( + left_brace="{", + op_name=op_name, + op_outputs=op_outputs_str, + right_brace="}") + ops_outputs_str = ops_outputs_str + " " + pair + ops_outputs_str = ops_outputs_str[:-2] + cpp_style_ops_outputs_map_str = start_ + ops_outputs_str + "\n};" + + # 3. 
Write to header file + dst_head_file = "../../paddle/infrt/dialect/pd_ops_info.h" + with open(dst_head_file, 'w') as ops_inputs_outputs_head_file: + ops_inputs_outputs_head_file.write(cpp_style_ops_inputs_map_str) + ops_inputs_outputs_head_file.write("\n\n") + ops_inputs_outputs_head_file.write(cpp_style_ops_outputs_map_str) + + # funtion to generate paddle op dialect file def convert_op_proto_into_mlir(op_descs): dst_dialect_file = "../../paddle/infrt/dialect/pd_ops.td" - dialect_info_file = "../../paddle/infrt/dialect/pd_ops_info.h" custom_dialect_file = "custom_pdop.td" # 1. Head files @@ -153,41 +235,38 @@ def convert_op_proto_into_mlir(op_descs): original_ops_ = get_original_ops() automatically_generated_op_dialect = [] - ops_inputs_map_ = {} - ops_outputs_map_ = {} - for op_type, op_proto in op_descs.items(): if (op_type in skipped_op_list) or (op_type not in original_ops_): continue automatically_generated_op_dialect.append(op_type) # 2.1 OpDef - HEAD = "def PD_" + op_type.capitalize( - ) + "Op : PD_Op<\"" + op_type + "\", [NoSideEffect]> {\n" - SUMMARY = " let summary = \"" + op_type + " op\";\n" + HEAD = 'def PD_{op_type_capitalize}Op : PD_Op<"{op_type}", [NoSideEffect]> {left_brace}\n'.format( + op_type_capitalize=op_type.capitalize(), + op_type=op_type, + left_brace="{") + SUMMARY = ' let summary = "{} op";\n'.format(op_type) CANONICALIZATION = "let hasCanonicalizer = 1;" if op_type in ops_having_canonicalization else "" # 2.2 Description - DESCRIPTION = " let description = [{\n" - contents = (op_proto[COMMENT]).split("\n") - for line_ in contents: - DESCRIPTION = DESCRIPTION + " " + line_ + "\n" - DESCRIPTION += " }];\n" + contents = "" + origin_contents = (op_proto[COMMENT]).split("\n") + for line_ in origin_contents: + contents = contents + " {}\n".format(line_) + DESCRIPTION = " let description = [{left_brace}\n{description} {right_brace}];\n".format( + left_brace="{", description=contents, right_brace="}") # 2.3 arguments info ARGUMENTS = "" if (len(op_proto[INPUTS]) > 0 or len(op_proto[ATTRS]) > 0): ARGUMENTS = " let arguments = (ins " # 2.3.1 inputs - ins_cache_list_ = [] for input_ in op_proto[INPUTS]: if op_proto[INPUTS][input_][EXTRA] != True and op_proto[INPUTS][ input_][INTERMEDIATE] != True: - ins_cache_list_.append(input_) if op_proto[INPUTS][input_][DUPLICABLE] != "true": ARGUMENTS = ARGUMENTS + " PD_Tensor:$" + input_ + "," else: ARGUMENTS = ARGUMENTS + " PD_Tensor_Array:$" + input_ + "," - ops_inputs_map_[op_type] = ins_cache_list_ # unsupported: BLOCK = 8; BLOCKS = 10; attr_mlir_converter = { 0: 'SI32Attr', @@ -252,19 +331,17 @@ def convert_op_proto_into_mlir(op_descs): # 2.4 results info RESULTS = "" if (len(op_proto[OUTPUTS]) > 0): - RESULTS = "\n let results = (outs " - outs_cache_list_ = [] + outputs = "" for output_ in op_proto[OUTPUTS]: if op_proto[OUTPUTS][output_][EXTRA] != True and op_proto[ OUTPUTS][output_][INTERMEDIATE] != True: - outs_cache_list_.append(output_) if op_proto[OUTPUTS][output_][DUPLICABLE] != "true": - RESULTS = RESULTS + "PD_Tensor:$" + output_ + "," + outputs = outputs + "PD_Tensor:${},".format(output_) else: - RESULTS = RESULTS + "PD_Tensor_Array:$" + output_ + "," - print(HEAD + " PD_Tensor_Array:$" + output_ + ",") - ops_outputs_map_[op_type] = outs_cache_list_ - RESULTS = RESULTS[:-1] + ");\n" + outputs = outputs + "PD_Tensor_Array:${},".format( + output_) + RESULTS = "\n let results = (outs {});\n".format(outputs[:-1]) + with open(dst_dialect_file, 'a') as ops_mlir_file: ops_mlir_file.write(HEAD) ops_mlir_file.write(SUMMARY) @@ 
-278,29 +355,6 @@ def convert_op_proto_into_mlir(op_descs): print("Automatically generated op dialects num: " + str( len(automatically_generated_op_dialect))) - with open(dialect_info_file, 'w') as pd_ops_info_file: - pd_ops_info_file.write( - "#include\n#include\n#include\n") - pd_ops_info_file.write( - "const std::map> pd_dialect_inputs_info_map_ = {\n" - ) - for data_ in ops_inputs_map_: - pd_ops_info_file.write(" {\"" + data_ + "\", {") - for var_ in ops_inputs_map_[data_]: - pd_ops_info_file.write("\"" + var_ + "\",") - pd_ops_info_file.write("}},\n") - pd_ops_info_file.write("};\n") - - pd_ops_info_file.write( - "const std::map> pd_dialect_outputs_info_map_ = {\n" - ) - for data_ in ops_outputs_map_: - pd_ops_info_file.write(" {\"" + data_ + "\", {") - for var_ in ops_outputs_map_[data_]: - pd_ops_info_file.write("\"" + var_ + "\",") - pd_ops_info_file.write("}},\n") - pd_ops_info_file.write("};\n") - # 3. custom op dialect and end of file with open(dst_dialect_file, 'a') as ops_mlir_file: with open(custom_dialect_file, 'r') as custom_ops_file: @@ -313,4 +367,5 @@ def convert_op_proto_into_mlir(op_descs): if __name__ == "__main__": all_op_protos_dict = get_all_ops_desc() + generate_all_ops_inputs_outputs_map(all_op_protos_dict) convert_op_proto_into_mlir(all_op_protos_dict) From 8f2d14adaff7155de76e572eb6019890825b507b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Thu, 17 Feb 2022 15:09:52 +0800 Subject: [PATCH 12/19] change classes to pten, test=develop (#39643) --- paddle/infrt/CMakeLists.txt | 6 +- paddle/infrt/backends/host/pten_allocator.h | 33 ++++++++ paddle/infrt/backends/host/pten_context.h | 26 ++++++ paddle/infrt/dialect/pten/CMakeLists.txt | 1 + .../infrt/dialect/pten/infrt_pten_kernel.td | 26 ++++++ paddle/infrt/dialect/pten/infrt_pten_tensor.h | 1 + .../infrt/dialect/pten/infrt_pten_tensor.td | 82 ++++--------------- paddle/infrt/dialect/pten/pten_base.cc | 28 +++++-- paddle/infrt/host_context/kernel_registry.h | 3 +- .../host_context/kernel_registry_test.cc | 2 +- paddle/infrt/host_context/mlir_exec.cc | 6 ++ paddle/infrt/host_context/value.cc | 34 ++++---- paddle/infrt/host_context/value.h | 37 +++++++-- paddle/infrt/kernel/CMakeLists.txt | 2 + paddle/infrt/kernel/pten/CMakeLists.txt | 19 +++++ paddle/infrt/kernel/pten/allocator_kernels.cc | 25 ++++++ .../pten/allocator_kernels.h} | 19 ++--- paddle/infrt/kernel/pten/context_kernels.cc | 25 ++++++ .../pten/context_kernels.h} | 24 ++---- .../infrt/kernel/pten/dense_tensor_kernels.cc | 38 +++++++++ .../dense_tensor_kernels.h} | 29 +++---- .../pten}/infershaped/elementwise_add.h | 36 ++++---- .../infershaped/infershape_launchers_test.cc | 36 ++++---- .../infershaped_kernel_launcher.cc | 19 +++-- .../infershaped/infershaped_kernel_launcher.h | 8 +- .../infershaped_kernel_launchers.cc | 36 ++++++++ .../infershaped_kernel_launchers.h | 10 +-- .../pten}/infershaped/infershaped_utils.h | 6 +- paddle/infrt/kernel/pten/registry.cc | 61 ++++++++++++++ .../{pten_kernels.h => pten/registry.h} | 0 paddle/infrt/naive/CMakeLists.txt | 8 -- .../naive/infershaped/infershaped_registry.cc | 55 ------------- .../naive/infershaped/infershaped_registry.h | 56 ------------- paddle/infrt/naive/meta_tensor.h | 47 ----------- .../tests/dialect/pten/dense_tensor.mlir | 9 +- paddle/pten/backends/cpu/cpu_context.cc | 4 + paddle/pten/backends/cpu/cpu_context.h | 2 + paddle/pten/core/device_context.cc | 2 + paddle/pten/core/device_context.h | 5 ++ 39 files changed, 499 
insertions(+), 367 deletions(-) create mode 100644 paddle/infrt/backends/host/pten_allocator.h create mode 100644 paddle/infrt/backends/host/pten_context.h create mode 100644 paddle/infrt/dialect/pten/infrt_pten_kernel.td create mode 100644 paddle/infrt/kernel/pten/CMakeLists.txt create mode 100644 paddle/infrt/kernel/pten/allocator_kernels.cc rename paddle/infrt/{naive/meta_tensor.cc => kernel/pten/allocator_kernels.h} (64%) create mode 100644 paddle/infrt/kernel/pten/context_kernels.cc rename paddle/infrt/{naive/infershaped/infershaped_kernel_launchers.cc => kernel/pten/context_kernels.h} (50%) create mode 100644 paddle/infrt/kernel/pten/dense_tensor_kernels.cc rename paddle/infrt/kernel/{pten_kernels.cc => pten/dense_tensor_kernels.h} (55%) rename paddle/infrt/{naive => kernel/pten}/infershaped/elementwise_add.h (67%) rename paddle/infrt/{naive => kernel/pten}/infershaped/infershape_launchers_test.cc (56%) rename paddle/infrt/{naive => kernel/pten}/infershaped/infershaped_kernel_launcher.cc (74%) rename paddle/infrt/{naive => kernel/pten}/infershaped/infershaped_kernel_launcher.h (90%) create mode 100644 paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc rename paddle/infrt/{naive => kernel/pten}/infershaped/infershaped_kernel_launchers.h (79%) rename paddle/infrt/{naive => kernel/pten}/infershaped/infershaped_utils.h (95%) create mode 100644 paddle/infrt/kernel/pten/registry.cc rename paddle/infrt/kernel/{pten_kernels.h => pten/registry.h} (100%) delete mode 100644 paddle/infrt/naive/CMakeLists.txt delete mode 100644 paddle/infrt/naive/infershaped/infershaped_registry.cc delete mode 100644 paddle/infrt/naive/infershaped/infershaped_registry.h delete mode 100644 paddle/infrt/naive/meta_tensor.h diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index c8253effe8488..2486c54d5addc 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -82,7 +82,6 @@ add_subdirectory(tensor) add_subdirectory(support) add_subdirectory(external_kernels) add_subdirectory(paddle) -add_subdirectory(naive) add_subdirectory(tests) @@ -99,14 +98,15 @@ set(infrt_mlir_incs trt_ops_inc ) if (INFRT_WITH_PTEN) + set(pten_libs pten) set(infrt_mlir_incs ${infrt_mlir_incs} MLIRinfrt_pten_tensorIncGen MLIRinfrt_pten_baseIncGen ) endif() -cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto infrt_naive) -cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} paddle_framework_proto) +cc_library(infrt SHARED SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${pten_libs} paddle_framework_proto infrt_naive) +cc_library(infrt_static SRCS ${infrt_src} DEPS glog boost ${mlir_libs} ${pten_libs} paddle_framework_proto) add_dependencies(infrt ${infrt_mlir_incs} mlir-headers) add_custom_target(test_infrt_exec DEPENDS ${INFRT_TEST_TARGETS}) diff --git a/paddle/infrt/backends/host/pten_allocator.h b/paddle/infrt/backends/host/pten_allocator.h new file mode 100644 index 0000000000000..172a808afbb5b --- /dev/null +++ b/paddle/infrt/backends/host/pten_allocator.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/allocator.h" + +namespace infrt { +namespace backends { + +class CpuPtenAllocator : public pten::Allocator { + public: + static void deleter(pten::Allocation* ptr) { ::operator delete(ptr); } + + AllocationPtr Allocate(size_t bytes_size) { + return AllocationPtr( + new pten::Allocation(::operator new(bytes_size), + bytes_size, + pten::Place(pten::AllocationType::CPU)), + deleter); + } +}; + +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/host/pten_context.h b/paddle/infrt/backends/host/pten_context.h new file mode 100644 index 0000000000000..1f5efeb272cef --- /dev/null +++ b/paddle/infrt/backends/host/pten_context.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/backends/cpu/cpu_context.h" + +namespace infrt { +namespace backends { + +class CpuPtenContext : public pten::CPUContext { + public: + using Base = pten::CPUContext; + using pten::CPUContext::SetEigenDevice; +}; + +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/dialect/pten/CMakeLists.txt b/paddle/infrt/dialect/pten/CMakeLists.txt index 0fb268952d54f..b4ed5cdc1d82f 100644 --- a/paddle/infrt/dialect/pten/CMakeLists.txt +++ b/paddle/infrt/dialect/pten/CMakeLists.txt @@ -5,6 +5,7 @@ endif() #mlir_tablegen_on(infrt_pten_base DIALECT pten) add_mlir_dialect(infrt_pten_base pten) add_mlir_dialect(infrt_pten_tensor pten_dt) +add_mlir_dialect(infrt_pten_kernel pten_kernel) #mlir_tablegen_on(infrt_pten_tensor) gather_srcs(infrt_src SRCS diff --git a/paddle/infrt/dialect/pten/infrt_pten_kernel.td b/paddle/infrt/dialect/pten/infrt_pten_kernel.td new file mode 100644 index 0000000000000..a3a1609d9918a --- /dev/null +++ b/paddle/infrt/dialect/pten/infrt_pten_kernel.td @@ -0,0 +1,26 @@ +#ifndef PTEN_KERNEL +#define PTEN_KERNEL + +include "paddle/infrt/dialect/pten/infrt_pten_tensor.td" + +def PTEN_KernelDialect : Dialect { + let name = "pten_kernel"; + + let description = [{ + The PTEN Kernel dialect. + }]; + + let cppNamespace = "::infrt::pten"; +} + +// PTEN Kernel related ops. 
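// PDT_Kernel is the common base for ops that map one-to-one onto pten
// kernels. FakeKernelOp below sketches the expected operand layout: the
// device context first, then the tensor inputs, then the op attributes.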
+class PDT_Kernel traits = []> : Op { +} + +def FakeKernelOp : PDT_Kernel<"pten.matmul.host.fp32"> { + let arguments = (ins CPU_Context:$dev_ctx, TensorType:$x, TensorType:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); + let results = (outs TensorType:$output); +} + +#endif + diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.h b/paddle/infrt/dialect/pten/infrt_pten_tensor.h index 24ac2d851fe86..5fe259300d2ae 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.h +++ b/paddle/infrt/dialect/pten/infrt_pten_tensor.h @@ -33,6 +33,7 @@ #include "paddle/infrt/dialect/pten/infrt_pten_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/pten/pten_base.h" // NOLINT #define GET_OP_CLASSES #include "paddle/infrt/dialect/pten/infrt_pten_tensor.h.inc" diff --git a/paddle/infrt/dialect/pten/infrt_pten_tensor.td b/paddle/infrt/dialect/pten/infrt_pten_tensor.td index 040c8ec3d3695..528f0f919680d 100644 --- a/paddle/infrt/dialect/pten/infrt_pten_tensor.td +++ b/paddle/infrt/dialect/pten/infrt_pten_tensor.td @@ -21,84 +21,36 @@ def PTEN_DenseTensorDialect : Dialect { class PDT_Op traits = []> : Op { } -class CreateUninitTensorOp - : PDT_Op<"create_uninit_tensor." # dtype, [NoSideEffect]> { - let summary = "pdt.create_uninit_tensor operation"; - - let description = [{ - An operation that creates an uninitialized tensor. - }]; - - let arguments = (ins I64ArrayAttr:$shape); - let results = (outs TensorType:$output); -} - -class CreateInitedTensorOp - : PDT_Op<"create_inited_tensor." #dtype, [NoSideEffect]> { - let summary = "pdt.create_inited_tensor operation"; - - let description = [{ - An operation that creates an tensor with shape and values assigned. - }]; - - let arguments = (ins I64ArrayAttr:$shape, array_attr:$values); +class CreateDenseTensorOp + : PDT_Op<"create_dense_tensor." # place # "." # dtype # "." # layout, [NoSideEffect]> { + let arguments = (ins CPU_Allocator:$allocator, I64ArrayAttr:$dims, I64ArrayAttr:$lod); let results = (outs TensorType:$output); } -def PrintTensorOp : PDT_Op<"print_tensor"> { - let summary = "pdt.print_tensor operation"; - - let description = [{ - An operation that prints a tensor. - }]; - - let arguments = (ins TensorType:$input); - let results = (outs); - let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; -} - -class FillTensor : - PDT_Op<"fill_tensor." # dtype> { - let summary = "dt.fill_tensor operation"; - - let description = [{ - An operation that fills an input tensor with a values. - }]; - +class FillDenseTensorOp : + PDT_Op<"fill_dense_tensor." # dtype> { let arguments = (ins TensorType:$input, attr_type:$value ); let results = (outs); - - let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } -class FillTensorWithConstantOp : - PDT_Op<"fill_tensor_with_constant." # dtype> { - let summary = "dt.fill_tensor_with_constant operation"; - - let description = [{ - An operation that fills an input tensor with a single value. - }]; - - let arguments = (ins - TensorType:$input, - AnyAttr:$value - ); - let results = (outs); - - let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +class CreateCPUAllocatorOp + : PDT_Op<"create_allocator." 
# "cpu", [NoSideEffect]> { + let arguments = (ins); + let results = (outs CPU_Allocator:$output); } -foreach dtype = ["ui8", "ui16", "ui32", "ui64", "i32", "f32", "f64", "i64"] in { - def PDT_CreateUninitTensorOp_#dtype : CreateUninitTensorOp; - def PDT_FillTensorWithConstantOp_#dtype : FillTensorWithConstantOp; +class CreateCPUContextOp + : PDT_Op<"create_context." # "cpu", [NoSideEffect]> { + let arguments = (ins); + let results = (outs CPU_Context:$output); } -def PDT_FillTensor_f32: FillTensor<"f32", F32ArrayAttr>; -def PDT_FillTensor_i32: FillTensor<"i32", I32ArrayAttr>; -def PDT_CreateInitedTensorOp_f32 : CreateInitedTensorOp<"f32", F32ArrayAttr>; -def PDT_CreateInitedTensorOp_i32 : CreateInitedTensorOp<"i32", I32ArrayAttr>; +def PDT_CreateDenseTensorOp_cpu_f32_nchw : CreateDenseTensorOp<"cpu", "f32", "nchw">; +def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; +def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; +def PDT_CreateContextOp_cpu : CreateCPUContextOp; #endif diff --git a/paddle/infrt/dialect/pten/pten_base.cc b/paddle/infrt/dialect/pten/pten_base.cc index ac23d44248982..ba87787dd7f7c 100644 --- a/paddle/infrt/dialect/pten/pten_base.cc +++ b/paddle/infrt/dialect/pten/pten_base.cc @@ -29,7 +29,23 @@ namespace pten { void PTENDialect::printType(::mlir::Type type, mlir::DialectAsmPrinter& os) const { - Dialect::printType(type, os); + if (type.isa()) { + os << "CPU_Allocator"; + return; + } + if (type.isa()) { + os << "GPU_Allocator"; + return; + } + if (type.isa()) { + os << "CPU_Context"; + return; + } + if (type.isa()) { + os << "GPU_Context"; + return; + } + llvm_unreachable("unexpected 'allocator/context' type kind"); } void PTENDialect::initialize() { @@ -46,14 +62,16 @@ void PTENDialect::initialize() { mlir::Type PTENDialect::parseType(mlir::DialectAsmParser& parser) const { llvm::StringRef keyword; if (parser.parseKeyword(&keyword)) return mlir::Type(); - if (keyword == "allocator_CPU") { + if (keyword == "CPU_allocator") { return CPUAllocatorType::get(parser.getContext()); - } else if (keyword == "allocator_GPU") { + } else if (keyword == "GPU_allocator") { return GPUAllocatorType::get(parser.getContext()); - } else if (keyword == "context_CPU") { + } else if (keyword == "CPU_context") { return CPUContextType::get(parser.getContext()); - } else if (keyword == "context_GPU") { + } else if (keyword == "GPU_context") { return GPUContextType::get(parser.getContext()); + } else { + llvm_unreachable("unexpected 'allocator/context' type kind"); } return mlir::Type(); diff --git a/paddle/infrt/host_context/kernel_registry.h b/paddle/infrt/host_context/kernel_registry.h index d65969999f6ed..a813f690efb0b 100644 --- a/paddle/infrt/host_context/kernel_registry.h +++ b/paddle/infrt/host_context/kernel_registry.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -23,7 +24,7 @@ namespace host_context { class KernelFrame; -using KernelImplementation = void (*)(KernelFrame *frame); +using KernelImplementation = std::function; /** * Hold the kernels registered in the system. 
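Since KernelImplementation is now a std::function instead of a raw function pointer, a registry can also store capturing lambdas as kernels. Below is a minimal self-contained sketch of what the widened alias permits; the empty KernelFrame here is an illustrative stand-in, not the real infrt type:

#include <functional>
#include <string>
#include <unordered_map>

struct KernelFrame {};  // illustrative stand-in for host_context::KernelFrame
using KernelImplementation = std::function<void(KernelFrame*)>;

int main() {
  std::unordered_map<std::string, KernelImplementation> registry;
  int calls = 0;
  // A capturing lambda converts to std::function but never to the old
  // `void (*)(KernelFrame*)` alias, which is what this change enables.
  registry["infrt.test.add.i32"] = [&calls](KernelFrame*) { ++calls; };
  KernelFrame frame;
  registry["infrt.test.add.i32"](&frame);
  return calls == 1 ? 0 : 1;
}

This is also why the test below now binds the result of GetKernel to a const reference instead of a raw pointer.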
diff --git a/paddle/infrt/host_context/kernel_registry_test.cc b/paddle/infrt/host_context/kernel_registry_test.cc index 7fca56343041c..fd2aecb3e6c1e 100644 --- a/paddle/infrt/host_context/kernel_registry_test.cc +++ b/paddle/infrt/host_context/kernel_registry_test.cc @@ -28,7 +28,7 @@ TEST(KernelRegistry, basic) { std::string key = "infrt.test.add.i32"; registry.AddKernel(key, INFRT_KERNEL(add_i32)); - auto* kernel_impl = registry.GetKernel(key); + const auto& kernel_impl = registry.GetKernel(key); ASSERT_TRUE(kernel_impl); ValueRef a(1); diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index b0d70af5ef9f2..62c907bc9159f 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -28,6 +28,9 @@ #include "paddle/infrt/kernel/tensor_kernels.h" #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" +#ifdef INFRT_WITH_PTEN +#include "paddle/infrt/kernel/pten/registry.h" +#endif static llvm::cl::list cl_shared_libs( // NOLINT "shared_libs", @@ -53,6 +56,9 @@ int main(int argc, char** argv) { kernel::RegisterTensorShapeKernels(®istry); kernel::RegisterTensorKernels(®istry); kernel::RegisterControlFlowKernels(®istry); +#ifdef INFRT_WITH_PTEN + kernel::RegisterPtenKernels(®istry); +#endif // load extra shared library for (const auto& lib_path : cl_shared_libs) { diff --git a/paddle/infrt/host_context/value.cc b/paddle/infrt/host_context/value.cc index 1c5a577092636..e8b904efb74a1 100644 --- a/paddle/infrt/host_context/value.cc +++ b/paddle/infrt/host_context/value.cc @@ -24,7 +24,13 @@ ValueRef::ValueRef(int64_t val) : Shared(new Value(val)) {} ValueRef::ValueRef(float val) : Shared(new Value(val)) {} ValueRef::ValueRef(double val) : Shared(new Value(val)) {} ValueRef::ValueRef(bool val) : Shared(new Value(val)) {} -ValueRef::ValueRef(naive::MetaTensor&& val) +ValueRef::ValueRef(backends::CpuPtenContext&& val) + : Shared(new Value(std::move(val))) {} +ValueRef::ValueRef(::pten::CPUContext&& val) + : Shared(new Value(std::move(val))) {} +ValueRef::ValueRef(::pten::DenseTensor&& val) + : Shared(new Value(std::move(val))) {} +ValueRef::ValueRef(::pten::MetaTensor&& val) : Shared(new Value(std::move(val))) {} const char* Value::type_info() const { return __type_info__; } @@ -36,31 +42,31 @@ void CopyTo(const Value& from, Value* to) { [&](auto&& arg) { using T = std::decay_t; if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else if (std::is_same>::value) - to->data = arg; + to->data = reinterpret_cast const&>(arg); else if (std::is_same>::value) - to->data = arg; + to->data = reinterpret_cast const&>(arg); else if (std::is_same::value) - to->data = arg; + to->data = reinterpret_cast(arg); else LOG(FATAL) 
<< "Not supported Value copy: " << typeid(T).name(); }, diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 904e51f92838d..f623e141512ce 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -23,15 +23,19 @@ #include "paddle/infrt/common/object.h" #include "paddle/infrt/common/shared.h" #include "paddle/infrt/host_context/function.h" -#include "paddle/infrt/naive/meta_tensor.h" #include "paddle/infrt/support/variant.h" #include "paddle/infrt/tensor/dense_host_tensor.h" #include "paddle/infrt/tensor/dense_tensor_view.h" #include "paddle/infrt/tensor/tensor_map.h" #include "paddle/infrt/tensor/tensor_shape.h" -// Disabled temporarily for failed compile, will enable latter. -// #include "paddle/pten/backends/cpu/cpu_context.h" -// #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/meta_tensor.h" + +#ifdef INFRT_WITH_PTEN +#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/infrt/backends/host/pten_context.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/dense_tensor.h" +#endif namespace infrt { namespace host_context { @@ -44,14 +48,20 @@ using ValueVariantType = Variant, std::vector, std::vector, @@ -84,7 +94,13 @@ class Value : public common::Object { explicit Value(tensor::TensorShape&& x) : data(std::move(x)) {} explicit Value(tensor::DenseHostTensor&& x) : data(std::move(x)) {} explicit Value(MlirFunctionExecutable* x) : data(x) {} - explicit Value(naive::MetaTensor&& x) : data(std::move(x)) {} +#ifdef INFRT_WITH_PTEN + explicit Value(backends::CpuPtenContext&& x) : data(std::move(x)) {} + explicit Value(::pten::CPUContext&& x) : data(std::move(x)) {} + explicit Value(::pten::DenseTensor&& x) : data(std::move(x)) {} + explicit Value(::pten::MetaTensor&& x) : data(std::move(x)) {} + explicit Value(backends::CpuPtenAllocator&& x) : data(std::move(x)) {} +#endif template const T& get() const { @@ -142,7 +158,10 @@ class ValueRef : common::Shared { explicit ValueRef(float val); explicit ValueRef(double val); explicit ValueRef(bool val); - explicit ValueRef(naive::MetaTensor&& val); + explicit ValueRef(::pten::MetaTensor&& val); + explicit ValueRef(backends::CpuPtenContext&& x); + explicit ValueRef(::pten::CPUContext&& x); + explicit ValueRef(::pten::DenseTensor&& x); using common::Shared::get; using common::Shared::Reset; diff --git a/paddle/infrt/kernel/CMakeLists.txt b/paddle/infrt/kernel/CMakeLists.txt index b7ef5691e4760..402665119ac2d 100644 --- a/paddle/infrt/kernel/CMakeLists.txt +++ b/paddle/infrt/kernel/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(pten) + core_gather_headers() gather_srcs(infrt_src SRCS diff --git a/paddle/infrt/kernel/pten/CMakeLists.txt b/paddle/infrt/kernel/pten/CMakeLists.txt new file mode 100644 index 0000000000000..65c10b0b15f8d --- /dev/null +++ b/paddle/infrt/kernel/pten/CMakeLists.txt @@ -0,0 +1,19 @@ +if (NOT INFRT_WITH_PTEN) + return() +endif() + +core_gather_headers() + +gather_srcs(infrt_src SRCS + registry.cc + dense_tensor_kernels.cc + context_kernels.cc + allocator_kernels.cc +) + +cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc + infershaped/infershaped_kernel_launchers.cc + ) + +cc_test_tiny(test_infrt_infershape_launchers SRCS +infershaped/infershape_launchers_test.cc DEPS infrt) diff --git a/paddle/infrt/kernel/pten/allocator_kernels.cc b/paddle/infrt/kernel/pten/allocator_kernels.cc new file mode 100644 index 0000000000000..d3ecbed15da96 --- /dev/null +++ 
b/paddle/infrt/kernel/pten/allocator_kernels.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/pten/allocator_kernels.h" + +namespace infrt { +namespace kernel { +namespace pten { + +backends::CpuPtenAllocator CreateCpuAllocator() { return {}; } + +} // namespace pten +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/naive/meta_tensor.cc b/paddle/infrt/kernel/pten/allocator_kernels.h similarity index 64% rename from paddle/infrt/naive/meta_tensor.cc rename to paddle/infrt/kernel/pten/allocator_kernels.h index 2f7ee3a69e290..33127711193a2 100644 --- a/paddle/infrt/naive/meta_tensor.cc +++ b/paddle/infrt/kernel/pten/allocator_kernels.h @@ -12,20 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/naive/meta_tensor.h" +#pragma once -#include "paddle/infrt/tensor/dense_host_tensor.h" -#include "paddle/infrt/tensor/tensor_shape.h" +#include "paddle/infrt/backends/host/pten_allocator.h" +#include "paddle/pten/core/dense_tensor.h" namespace infrt { -namespace naive { +namespace kernel { +namespace pten { -const tensor::TensorShape& MetaTensor::shape() const { - return mutable_tensor_->shape(); -} -tensor::TensorShape* MetaTensor::mutable_shape() { - return mutable_tensor_->mutable_shape(); -} +backends::CpuPtenAllocator CreateCpuAllocator(); -} // namespace naive +} // namespace pten +} // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/context_kernels.cc b/paddle/infrt/kernel/pten/context_kernels.cc new file mode 100644 index 0000000000000..0c5e53212113b --- /dev/null +++ b/paddle/infrt/kernel/pten/context_kernels.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
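// As with CreateCpuAllocator above, the factory kernels in these files
// return the backend object by value; the host-context Value then takes
// ownership by move, so no raw pointers cross the kernel boundary.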
+ +#include "paddle/infrt/kernel/pten/context_kernels.h" + +namespace infrt { +namespace kernel { +namespace pten { + +backends::CpuPtenContext CreateCpuContext() { return {}; } + +} // namespace pten +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/naive/infershaped/infershaped_kernel_launchers.cc b/paddle/infrt/kernel/pten/context_kernels.h similarity index 50% rename from paddle/infrt/naive/infershaped/infershaped_kernel_launchers.cc rename to paddle/infrt/kernel/pten/context_kernels.h index e570b3521b795..14a151d9d1d8e 100644 --- a/paddle/infrt/naive/infershaped/infershaped_kernel_launchers.cc +++ b/paddle/infrt/kernel/pten/context_kernels.h @@ -12,23 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/naive/infershaped/infershaped_kernel_launchers.h" -#include "paddle/infrt/naive/infershaped/elementwise_add.h" -#include "paddle/infrt/naive/infershaped/infershaped_registry.h" +#pragma once -namespace infrt { -namespace naive { +#include "paddle/infrt/backends/host/pten_context.h" +#include "paddle/pten/core/dense_tensor.h" -using ElementwiseAddLauncher = - KernelLauncher; +namespace infrt { +namespace kernel { +namespace pten { -void RegisterInferShapeLaunchers(InferShapedKernelRegistry* registry) { - registry->AddKernel("elementwise_add", - INFERSHAPED_KERNEL_CREATOR(ElementwiseAddLauncher)); -} +backends::CpuPtenContext CreateCpuContext(); -} // namespace naive +} // namespace pten +} // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/dense_tensor_kernels.cc b/paddle/infrt/kernel/pten/dense_tensor_kernels.cc new file mode 100644 index 0000000000000..2db5f4a3c1179 --- /dev/null +++ b/paddle/infrt/kernel/pten/dense_tensor_kernels.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" + +namespace infrt { +namespace kernel { +namespace pten { + +::pten::DenseTensor CreateDenseTensorCpuF32Nchw( + backends::CpuPtenAllocator* allocator, + host_context::Attribute> dims, + host_context::Attribute> lod) { + return ::pten::DenseTensor( + allocator, + ::pten::DenseTensorMeta(::pten::DataType::FLOAT32, + ::pten::framework::make_ddim(dims.get()), + ::pten::DataLayout::NCHW, + {})); +} + +void FillDenseTensorF32(::pten::DenseTensor* dense_tensor, + host_context::Attribute> values) {} + +} // namespace pten +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/pten_kernels.cc b/paddle/infrt/kernel/pten/dense_tensor_kernels.h similarity index 55% rename from paddle/infrt/kernel/pten_kernels.cc rename to paddle/infrt/kernel/pten/dense_tensor_kernels.h index 62e2db659ad42..f60525707cd77 100644 --- a/paddle/infrt/kernel/pten_kernels.cc +++ b/paddle/infrt/kernel/pten/dense_tensor_kernels.h @@ -12,29 +12,24 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/kernel/pten_kernels.h" +#pragma once -#include -#include - -#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/backends/host/pten_allocator.h" #include "paddle/infrt/host_context/kernel_utils.h" - -// Disable temporarily. -// #include "paddle/pten/backends/cpu/cpu_context.h" -// #include "paddle/pten/kernels/math_kernel.h" - -using infrt::host_context::Attribute; +#include "paddle/pten/core/dense_tensor.h" namespace infrt { namespace kernel { +namespace pten { + +::pten::DenseTensor CreateDenseTensorCpuF32Nchw( + backends::CpuPtenAllocator* allocator, + host_context::Attribute> dims, + host_context::Attribute> lod); -void RegisterPtenKernels(host_context::KernelRegistry* registry) { - registry->AddKernel("pd_cpu.add.float32", - INFRT_KERNEL(pten::AddKernel)); - registry->AddKernel("pd_cpu.add.int32", - INFRT_KERNEL(pten::AddKernel)); -} +void FillDenseTensorF32(::pten::DenseTensor* dense_tensor, + host_context::Attribute> values); +} // namespace pten } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/naive/infershaped/elementwise_add.h b/paddle/infrt/kernel/pten/infershaped/elementwise_add.h similarity index 67% rename from paddle/infrt/naive/infershaped/elementwise_add.h rename to paddle/infrt/kernel/pten/infershaped/elementwise_add.h index ee044e38da03d..1d9d0106da539 100644 --- a/paddle/infrt/naive/infershaped/elementwise_add.h +++ b/paddle/infrt/kernel/pten/infershaped/elementwise_add.h @@ -16,27 +16,23 @@ #include #include "paddle/infrt/host_context/kernel_utils.h" -#include "paddle/infrt/naive/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/naive/infershaped/infershaped_utils.h" +#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" // This file contains a example of the infershape ElementwiseAdd kernel. // Some of the following code should be generated from PTEN by script. 
namespace infrt { -namespace naive { +namespace kernel { -static void ElementwiseAddInferShape(const MetaTensor& a, - const MetaTensor& b, - MetaTensor* c) { - CHECK(a.shape() == b.shape()) - << "ElementwiseAdd, but shapes of a b are not match"; - *c->mutable_shape() = a.shape(); -} +static void ElementwiseAddInferShape(const ::pten::MetaTensor& a, + const ::pten::MetaTensor& b, + ::pten::MetaTensor* c) {} -static void ElementwiseAdd(tensor::DenseHostTensor* /*Context*/, - const tensor::DenseHostTensor& a, - const tensor::DenseHostTensor& b, - tensor::DenseHostTensor* c) {} +static void ElementwiseAdd(const ::pten::CPUContext& /*Context*/, + const ::pten::DenseTensor& a, + const ::pten::DenseTensor& b, + ::pten::DenseTensor* c) {} template +void KernelLauncherFunc( + KernelLauncher launcher, + host_context::KernelFrame* frame) { + launcher.Invoke(frame); +} + +} // namespace kernel } // namespace infrt diff --git a/paddle/infrt/naive/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc similarity index 56% rename from paddle/infrt/naive/infershaped/infershape_launchers_test.cc rename to paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc index ba6fdbdd5783f..64b99110d94c7 100644 --- a/paddle/infrt/naive/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/pten/infershaped/infershape_launchers_test.cc @@ -14,19 +14,17 @@ #include -#include "paddle/infrt/naive/infershaped/infershaped_kernel_launcher.h" -#include "paddle/infrt/naive/infershaped/infershaped_kernel_launchers.h" -#include "paddle/infrt/naive/infershaped/infershaped_registry.h" -#include "paddle/infrt/naive/infershaped/infershaped_utils.h" -#include "paddle/infrt/tensor/dense_host_tensor.h" +#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/pten/infershaped/infershaped_utils.h" namespace infrt { -namespace naive { +namespace kernel { namespace { -static void ElementwiseAddTest(const tensor::DenseHostTensor& a, - const tensor::DenseHostTensor& b, - tensor::DenseHostTensor* c); +static void ElementwiseAddTest(const ::pten::DenseTensor& a, + const ::pten::DenseTensor& b, + ::pten::DenseTensor* c); } TEST(utils, registry) { @@ -35,26 +33,24 @@ TEST(utils, registry) { CHECK_EQ(count, 2U); } -TEST(ElementwiseAdd, registry) { - InferShapedKernelRegistry registry; +TEST(ElementwiseAdd, launcher_registry) { + host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_EQ(registry.size(), 1UL); auto creator = registry.GetKernel("elementwise_add"); - auto infershape_launcher_handle = creator(); - // fake some tensors - tensor::DenseHostTensor a({2, 8}, GetDType()); - tensor::DenseHostTensor b({2, 8}, GetDType()); - tensor::DenseHostTensor c({2, 8}, GetDType()); + ::pten::CPUContext ctx{}; + ::pten::DenseTensor a{}; + ::pten::DenseTensor b{}; + ::pten::DenseTensor c{}; host_context::KernelFrameBuilder kernel_frame_builder; - kernel_frame_builder.AddArgument(new host_context::Value(0)); + kernel_frame_builder.AddArgument(new host_context::Value(std::move(ctx))); kernel_frame_builder.AddArgument(new host_context::Value(std::move(a))); kernel_frame_builder.AddArgument(new host_context::Value(std::move(b))); kernel_frame_builder.SetResults({new host_context::Value(std::move(c))}); - - infershape_launcher_handle->Invoke(&kernel_frame_builder); + creator(&kernel_frame_builder); } -} // namespace naive +} 
// namespace kernel } // namespace infrt diff --git a/paddle/infrt/naive/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc similarity index 74% rename from paddle/infrt/naive/infershaped/infershaped_kernel_launcher.cc rename to paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc index 6a2c4a51ecdb2..80f8bae4018cb 100644 --- a/paddle/infrt/naive/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.cc @@ -12,18 +12,19 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/naive/infershaped/infershaped_kernel_launcher.h" +#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h" namespace infrt { -namespace naive { +namespace kernel { void InferShapedKernelLauncher::CreateKernelFrameForInferShape( host_context::KernelFrame* frame) { for (host_context::Value* value : frame->GetValues(1, frame->GetNumElements() - 1)) { // TODO(Superjomn) To extend this. - if (value->is_type()) { - values.emplace_back(MetaTensor{&value->get()}); + if (value->is_type<::pten::DenseTensor>()) { + values.emplace_back( + ::pten::MetaTensor{&value->get<::pten::DenseTensor>()}); infershape_kernel_frame_builder.AddArgument(values.back().get()); } else { infershape_kernel_frame_builder.AddArgument(value); @@ -35,8 +36,9 @@ void InferShapedKernelLauncher::BuildInferShapeCache( const uint16_t num_inputs) { tensor_shape_cache.resize(num_inputs); for (uint16_t i = 0; i < num_inputs; i++) { - tensor_shape_cache[i] = - infershape_kernel_frame_builder.GetArgAt(i)->get().shape(); + tensor_shape_cache[i] = infershape_kernel_frame_builder.GetArgAt(i) + ->get<::pten::MetaTensor>() + .dims(); } } @@ -49,10 +51,11 @@ bool InferShapedKernelLauncher::IsShapeChanged( for (uint16_t i = 0; i < num_inputs && !changed; i++) { changed = changed || (tensor_shape_cache[i] != - infershape_kernel_frame_builder.GetArgAt(i).shape()); + infershape_kernel_frame_builder.GetArgAt<::pten::MetaTensor>(i) + .dims()); } return changed; } -} // namespace naive +} // namespace kernel } // namespace infrt diff --git a/paddle/infrt/naive/infershaped/infershaped_kernel_launcher.h b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h similarity index 90% rename from paddle/infrt/naive/infershaped/infershaped_kernel_launcher.h rename to paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h index 890a779ed2403..9348bf8d05008 100644 --- a/paddle/infrt/naive/infershaped/infershaped_kernel_launcher.h +++ b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launcher.h @@ -17,11 +17,9 @@ #include "paddle/infrt/host_context/kernel_frame.h" #include "paddle/infrt/host_context/value.h" -#include "paddle/infrt/naive/meta_tensor.h" -#include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { -namespace naive { +namespace kernel { struct InferShapedKernelLauncher { virtual void Invoke(host_context::KernelFrame* frame) = 0; @@ -46,9 +44,9 @@ struct InferShapedKernelLauncher { // values to hold the TensorMeta. 
llvm::SmallVector values; - llvm::SmallVector tensor_shape_cache; + llvm::SmallVector<::pten::DDim, 3> tensor_shape_cache; host_context::KernelFrameBuilder infershape_kernel_frame_builder; }; -} // namespace naive +} // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc new file mode 100644 index 0000000000000..23d4f919af057 --- /dev/null +++ b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h" +#include "paddle/infrt/kernel/pten/infershaped/elementwise_add.h" + +namespace infrt { +namespace kernel { + +void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry) { + registry->AddKernel( + "elementwise_add", + std::bind(&KernelLauncherFunc, + KernelLauncher(), + std::placeholders::_1)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/naive/infershaped/infershaped_kernel_launchers.h b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h similarity index 79% rename from paddle/infrt/naive/infershaped/infershaped_kernel_launchers.h rename to paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h index 3e83b690bb8df..ba25f06876cca 100644 --- a/paddle/infrt/naive/infershaped/infershaped_kernel_launchers.h +++ b/paddle/infrt/kernel/pten/infershaped/infershaped_kernel_launchers.h @@ -14,12 +14,12 @@ #pragma once -namespace infrt { -namespace naive { +#include "paddle/infrt/host_context/kernel_registry.h" -struct InferShapedKernelRegistry; +namespace infrt { +namespace kernel { -void RegisterInferShapeLaunchers(InferShapedKernelRegistry* registry); +void RegisterInferShapeLaunchers(host_context::KernelRegistry* registry); -} // namespace naive +} // namespace kernel } // namespace infrt diff --git a/paddle/infrt/naive/infershaped/infershaped_utils.h b/paddle/infrt/kernel/pten/infershaped/infershaped_utils.h similarity index 95% rename from paddle/infrt/naive/infershaped/infershaped_utils.h rename to paddle/infrt/kernel/pten/infershaped/infershaped_utils.h index 8155d87231a8f..aa5e900b8b26a 100644 --- a/paddle/infrt/naive/infershaped/infershaped_utils.h +++ b/paddle/infrt/kernel/pten/infershaped/infershaped_utils.h @@ -18,10 +18,10 @@ #include "paddle/infrt/tensor/dense_host_tensor.h" namespace infrt { -namespace naive { +namespace kernel { namespace infershaped { -using KeyType = const tensor::DenseHostTensor&; +using KeyType = const ::pten::DenseTensor&; using CountType = uint8_t; constexpr CountType value(std::true_type) { return 1; } @@ -73,5 +73,5 @@ struct InferShapeHelper { static constexpr int count = infershaped::count(); }; -} // namespace naive +} // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/pten/registry.cc 
b/paddle/infrt/kernel/pten/registry.cc new file mode 100644 index 0000000000000..888992c47d968 --- /dev/null +++ b/paddle/infrt/kernel/pten/registry.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/kernel/pten/registry.h" + +#include +#include + +#include "paddle/infrt/host_context/kernel_registry.h" +#include "paddle/infrt/host_context/kernel_utils.h" +#include "paddle/infrt/kernel/pten/allocator_kernels.h" +#include "paddle/infrt/kernel/pten/context_kernels.h" +#include "paddle/infrt/kernel/pten/dense_tensor_kernels.h" +#include "paddle/infrt/kernel/pten/infershaped/elementwise_add.h" +#include "paddle/pten/include/infermeta.h" +#include "paddle/pten/include/kernels.h" +#include "paddle/pten/kernels/matmul_kernel.h" + +using infrt::host_context::Attribute; + +namespace infrt { +namespace kernel { + +void RegisterPtenKernels(host_context::KernelRegistry* registry) { + registry->AddKernel("pten_dt.create_allocator.cpu", + INFRT_KERNEL(infrt::kernel::pten::CreateCpuAllocator)); + registry->AddKernel("pten_dt.create_context.cpu", + INFRT_KERNEL(infrt::kernel::pten::CreateCpuContext)); + registry->AddKernel( + "pten_dt.create_dense_tensor.cpu.f32.nchw", + INFRT_KERNEL(infrt::kernel::pten::CreateDenseTensorCpuF32Nchw)); + registry->AddKernel("pten_dt.fill_dense_tensor.f32", + INFRT_KERNEL(infrt::kernel::pten::FillDenseTensorF32)); + registry->AddKernel( + "pten.matmul.host.fp32", + std::bind(&kernel::KernelLauncherFunc< + decltype(&::pten::MatmulKernel), + &::pten::MatmulKernel, + decltype(&::pten::MatmulInferMeta), + &::pten::MatmulInferMeta>, + kernel::KernelLauncher< + decltype(&::pten::MatmulKernel), + &::pten::MatmulKernel, + decltype(&::pten::MatmulInferMeta), + &::pten::MatmulInferMeta>(), + std::placeholders::_1)); +} + +} // namespace kernel +} // namespace infrt diff --git a/paddle/infrt/kernel/pten_kernels.h b/paddle/infrt/kernel/pten/registry.h similarity index 100% rename from paddle/infrt/kernel/pten_kernels.h rename to paddle/infrt/kernel/pten/registry.h diff --git a/paddle/infrt/naive/CMakeLists.txt b/paddle/infrt/naive/CMakeLists.txt deleted file mode 100644 index c90c6e7ba7b88..0000000000000 --- a/paddle/infrt/naive/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -cc_library(infrt_naive SRCS meta_tensor.cc - infershaped/infershaped_kernel_launcher.cc - infershaped/infershaped_registry.cc - infershaped/infershaped_kernel_launchers.cc - ) - -cc_test_tiny(test_infrt_infershape_launchers SRCS -infershaped/infershape_launchers_test.cc DEPS infrt) diff --git a/paddle/infrt/naive/infershaped/infershaped_registry.cc b/paddle/infrt/naive/infershaped/infershaped_registry.cc deleted file mode 100644 index 94218a9a6f6a6..0000000000000 --- a/paddle/infrt/naive/infershaped/infershaped_registry.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/infrt/naive/infershaped/infershaped_registry.h" - -#include - -#include "paddle/infrt/naive/infershaped/infershaped_kernel_launcher.h" - -namespace infrt { -namespace naive { - -struct InferShapedKernelRegistry::Impl { - std::unordered_map data; -}; - -InferShapedKernelRegistry::InferShapedKernelRegistry() - : impl_(std::make_unique()) {} - -void InferShapedKernelRegistry::AddKernel( - const std::string& key, - InferShapedKernelRegistry::InferShapeLauncherCreator&& creator) { - CHECK(!impl_->data.count(key)) << "Item called " << key << " duplicates"; - impl_->data.emplace(key, std::move(creator)); -} - -const InferShapedKernelRegistry::InferShapeLauncherCreator& -InferShapedKernelRegistry::GetKernel(const std::string& key) const { - auto it = impl_->data.find(key); - CHECK(it != impl_->data.end()) << "No item called " << key << " exists"; - return it->second; -} - -size_t InferShapedKernelRegistry::size() const { return impl_->data.size(); } - -InferShapedKernelRegistry* GetInferShapeRegistry() { - static auto registry = std::make_unique(); - return registry.get(); -} - -InferShapedKernelRegistry::~InferShapedKernelRegistry() {} - -} // namespace naive -} // namespace infrt diff --git a/paddle/infrt/naive/infershaped/infershaped_registry.h b/paddle/infrt/naive/infershaped/infershaped_registry.h deleted file mode 100644 index e0e56a148fa3d..0000000000000 --- a/paddle/infrt/naive/infershaped/infershaped_registry.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -namespace infrt { -namespace naive { - -struct InferShapedKernelLauncher; - -class InferShapedKernelRegistry { - public: - using InferShapeLauncherHandle = std::unique_ptr; - using InferShapeLauncherCreator = std::function; - - InferShapedKernelRegistry(); - - void AddKernel(const std::string& key, InferShapeLauncherCreator&& creator); - - const InferShapeLauncherCreator& GetKernel(const std::string& key) const; - - size_t size() const; - - ~InferShapedKernelRegistry(); - - private: - struct Impl; - - std::unique_ptr impl_; -}; - -//! The global infershape registry. 
-InferShapedKernelRegistry* GetInferShapeRegistry(); - -} // namespace naive -} // namespace infrt - -#define INFERSHAPED_KERNEL_CREATOR(infershape_launcher_class_) \ - []() \ - -> ::infrt::naive::InferShapedKernelRegistry::InferShapeLauncherHandle { \ - return std::make_unique(); \ - } diff --git a/paddle/infrt/naive/meta_tensor.h b/paddle/infrt/naive/meta_tensor.h deleted file mode 100644 index 4b62f3021a3a6..0000000000000 --- a/paddle/infrt/naive/meta_tensor.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// A naive implementation of MetaTensor -#pragma once -#include "paddle/infrt/common/common.h" - -namespace infrt { -namespace tensor { -struct DenseHostTensor; -struct TensorShape; -} // namespace tensor - -namespace naive { - -class MetaTensor { - public: - MetaTensor() = default; - explicit MetaTensor(tensor::DenseHostTensor* tensor) - : mutable_tensor_(tensor) {} - explicit MetaTensor(const tensor::DenseHostTensor* tensor) - : mutable_tensor_(&Reference(tensor)) {} - explicit MetaTensor(MetaTensor&& other) - : mutable_tensor_(other.mutable_tensor_) {} - explicit MetaTensor(const MetaTensor& other) - : mutable_tensor_(other.mutable_tensor_) {} - - const tensor::TensorShape& shape() const; - tensor::TensorShape* mutable_shape(); - - private: - tensor::DenseHostTensor* mutable_tensor_{}; -}; - -} // namespace naive -} // namespace infrt diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir index 109fa2d6fa741..88f5b289fd9f8 100644 --- a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir @@ -1,10 +1,11 @@ // RUN: infrtopt %s | FileCheck %s -// CHECK-LABEL: basic_tensor +// CHECK-LABEL: @basic_tensor func @basic_tensor() { - %a = "pten_dt.create_uninit_tensor.f32" () { shape=[12:i64, 23:i64] } : () -> !infrt.tensor - %b = "pten_dt.create_inited_tensor.f32" () { shape=[2:i64, 2:i64], values=[0.1:f32, 0.2:f32, 0.3:f32, 0.4:f32] } : () -> !infrt.tensor - "pten_dt.fill_tensor_with_constant.f32" (%a) { value=0.1:f32 } : (!infrt.tensor) -> () + %a = "pten_dt.create_allocator.cpu" (): () -> !pten.CPU_allocator + %b = "pten_dt.create_context.cpu" (): () -> !pten.CPU_context + %c = "pten_dt.create_dense_tensor.cpu.f32.nchw" (%a) {dims=[1:i64], lod=[1:i64]}: (!pten.CPU_allocator) -> (!infrt.tensor) + // "pten_dt.fill_dense_tensor.f32" (%c) {value=[1.0:f32]} : (!infrt.tensor) -> () infrt.return } diff --git a/paddle/pten/backends/cpu/cpu_context.cc b/paddle/pten/backends/cpu/cpu_context.cc index 4029c286a5b28..5eb89c2dc658d 100644 --- a/paddle/pten/backends/cpu/cpu_context.cc +++ b/paddle/pten/backends/cpu/cpu_context.cc @@ -58,6 +58,10 @@ CPUContext::CPUContext(const Place& place) CPUContext::~CPUContext() = default; +CPUContext::CPUContext(CPUContext&&) = default; + +CPUContext& CPUContext::operator=(CPUContext&&) = default; + void CPUContext::Init() { impl_->Init(); } 
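The move constructor and move assignment just added to CPUContext (together with the DeviceContext move-assign below) exist so that a ::pten::CPUContext can be held by value inside host_context::Value's variant, into which this patch adds an explicit Value(::pten::CPUContext&&) overload. A minimal sketch of that constraint — MiniContext is a hypothetical stand-in, not part of the patch:

    #include <variant>

    // MiniContext mimics pten::CPUContext: non-copyable (device contexts own
    // unique resources), so a variant can hold it by value only if it is movable.
    struct MiniContext {
      MiniContext() = default;
      MiniContext(MiniContext&&) = default;             // counterpart of CPUContext(CPUContext&&)
      MiniContext& operator=(MiniContext&&) = default;  // counterpart of operator=(CPUContext&&)
      MiniContext(const MiniContext&) = delete;
    };

    int main() {
      std::variant<int, MiniContext> slot;  // like ValueVariantType holding a context
      slot = MiniContext{};  // move-assigns; would not compile without the move operations
      return 0;
    }
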
Eigen::DefaultDevice* CPUContext::eigen_device() const { diff --git a/paddle/pten/backends/cpu/cpu_context.h b/paddle/pten/backends/cpu/cpu_context.h index dca87a786b961..1e4109d3eeb7f 100644 --- a/paddle/pten/backends/cpu/cpu_context.h +++ b/paddle/pten/backends/cpu/cpu_context.h @@ -27,6 +27,8 @@ namespace pten { class CPUContext : public DeviceContext { public: CPUContext(); + CPUContext(CPUContext&&); + CPUContext& operator=(CPUContext&&); explicit CPUContext(const Place&); virtual ~CPUContext(); Eigen::DefaultDevice* eigen_device() const; diff --git a/paddle/pten/core/device_context.cc b/paddle/pten/core/device_context.cc index 70d71b5c767ea..bc9d7fc7d29b2 100644 --- a/paddle/pten/core/device_context.cc +++ b/paddle/pten/core/device_context.cc @@ -149,6 +149,8 @@ DeviceContext::DeviceContext(DeviceContext&& other) { impl_ = std::move(other.impl_); } +DeviceContext& DeviceContext::operator=(DeviceContext&&) = default; + DeviceContext::~DeviceContext() = default; void DeviceContext::SetAllocator(const Allocator* allocator) { diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h index d627f19b55dbc..05753b531ff08 100644 --- a/paddle/pten/core/device_context.h +++ b/paddle/pten/core/device_context.h @@ -49,6 +49,11 @@ class DeviceContext { */ DeviceContext(DeviceContext&&); + /** + * @brief Move assign operator. + */ + DeviceContext& operator=(DeviceContext&&); + /** * @brief Default destruct. */ From 1f7f8561fcece3050166e07719228fa5dd8c2256 Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Thu, 17 Feb 2022 15:27:23 +0800 Subject: [PATCH 13/19] update kunlun label_smooth unittest (#39611) * update kunlun label_smooth unittest *test=kunlun * minor *test=kunlun --- .../unittests/xpu/test_label_smooth_op_xpu.py | 79 ++++++++++++------- tools/check_file_diff_approvals.sh | 4 +- 2 files changed, 52 insertions(+), 31 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py index 5a827c1beb291..afe1662ce5cfc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_label_smooth_op_xpu.py @@ -20,45 +20,66 @@ import sys sys.path.append("..") from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() -class TestLabelSmoothOp(XPUOpTest): - def config(self): - self.op_type = "label_smooth" - self.epsilon = 0.1 - self.use_xpu = True - batch_size, self.label_dim = 10, 12 - self.label = np.zeros((batch_size, self.label_dim)).astype("float32") - nonzero_index = np.random.randint(self.label_dim, size=(batch_size)) - self.label[np.arange(batch_size), nonzero_index] = 1 +class XPUTestLabelSmoothOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'label_smooth' + self.use_dynamic_create_class = True - def setUp(self): - self.config() - smoothed_label = (1 - self.epsilon - ) * self.label + self.epsilon / self.label_dim - self.inputs = {'X': self.label} - self.attrs = {'epsilon': self.epsilon} - self.outputs = {'Out': smoothed_label} + def dynamic_create_class(self): + base_class = self.TestLabelSmoothOp + classes = [] + batch_sizes = [1, 5, 1024] + label_dims = [1, 7, 12] + for bs in batch_sizes: + for label_dim in label_dims: + class_name = 'XPUTestLabelSmooth_' + \ + str(bs) + "_" + str(label_dim) + attr_dict = {'batch_size': bs, 'label_dim': label_dim} + classes.append([class_name, attr_dict]) +
classes.append(['XPUTestLabelSmooth_3d', {'is_3d': True}]) + return base_class, classes - def test_check_output(self): - if not paddle.is_compiled_with_xpu(): - return - self.check_output_with_place(paddle.XPUPlace(0), atol=1e-6) + class TestLabelSmoothOp(XPUOpTest): + def setUp(self): + self.op_type = "label_smooth" + self.epsilon = 0.1 + self.use_xpu = True + if not hasattr(self, 'batch_size'): + self.batch_size = 10 + self.label_dim = 12 + self.label = np.zeros( + (self.batch_size, self.label_dim)).astype("float32") + nonzero_index = np.random.randint( + self.label_dim, size=(self.batch_size)) + self.label[np.arange(self.batch_size), nonzero_index] = 1 + smoothed_label = (1 - self.epsilon + ) * self.label + self.epsilon / self.label_dim + self.inputs = {'X': self.label} + self.attrs = {'epsilon': self.epsilon} + self.outputs = {'Out': smoothed_label} + if hasattr(self, 'is_3d') and self.is_3d: + self.inputs['X'] = self.inputs['X'].reshape( + [2, -1, self.inputs['X'].shape[-1]]) + self.outputs['Out'] = self.outputs['Out'].reshape(self.inputs[ 'X'].shape) - def test_check_grad(self): - return + def test_check_output(self): + if not paddle.is_compiled_with_xpu(): + return + self.check_output_with_place(paddle.XPUPlace(0), atol=1e-6) + def test_check_grad(self): + return -class TestLabelSmoothOp3D(TestLabelSmoothOp): - def setUp(self): - super(TestLabelSmoothOp3D, self).setUp() - self.inputs['X'] = self.inputs['X'].reshape( - [2, -1, self.inputs['X'].shape[-1]]) - self.outputs['Out'] = self.outputs['Out'].reshape(self.inputs['X'] .shape) +support_types = get_xpu_op_support_types('label_smooth') +for stype in support_types: + create_test_class(globals(), XPUTestLabelSmoothOp, stype) if __name__ == '__main__': unittest.main() diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index a36f173454f6a..f9826da20ce08 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -213,8 +213,8 @@ fi NO_NPU_FILE=`git diff --name-only upstream/$BRANCH | grep -v "_npu.py"` HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH ${NO_NPU_FILE} | grep "^+[[:space:]]\{0,\}@unittest.skip" || true` if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder, luotao1 or qili93) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" - check_approval 1 22165420 6836917 46661762 26922892 16605440 + echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder, luotao1, QingshuChen or qili93) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" + check_approval 1 22165420 6836917 46661762 26922892 16605440 2002279 fi HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true` From 18c6f40b96a0aee0a1943ec0ad762b3e38ba4834 Mon Sep 17 00:00:00 2001 From: Baibaifan <39549453+Baibaifan@users.noreply.github.com> Date: Thu, 17 Feb 2022 15:46:58 +0800 Subject: [PATCH 14/19] optimizer sharding parameters (#39581) --- .../sharding_optimizer_stage2.py | 24 ++++++++++- .../meta_parallel/sharding/sharding_stage2.py | 40 +------------------ .../meta_parallel/sharding/sharding_stage3.py | 19 ++++----- .../unittests/dygraph_sharding_stage2.py | 15 +++---- .../dygraph_sharding_stage2_offload.py | 5 +-- .../unittests/dygraph_sharding_stage3.py | 14 ++-----
.../dygraph_sharding_stage3_offload.py | 6 +-- 7 files changed, 45 insertions(+), 78 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index ea17f96f7a1ca..08baeae89ad4a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -65,9 +65,9 @@ def __init__(self, params, optim, group=None, - broadcast_fp16=False, offload=False, device="gpu", + pertrain_sync_models=True, **kw): super().__init__(optim._learning_rate, params, kw) @@ -98,8 +98,12 @@ def __init__(self, self.world_size = self.group.nranks self.rank = self.group.rank + self._global_root_rank = 0 + + # Synchronous all ranks models + if pertrain_sync_models: + self._sync_params_and_buffers() - self.broadcast_fp16 = broadcast_fp16 self.param_storages = {} # {dtype: {rank: InternalStorage}} if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): @@ -132,6 +136,22 @@ def __init__(self, # Update optimizer parameters and adjust parameter storage and use according to rank. self._update_opt_status() + @paddle.no_grad() + def _sync_params_and_buffers(self): + """ + Sync all model states for all ranks + """ + + for p in self._local_params: + dist.broadcast( + p, + src=self._global_root_rank, + group=self.group, + use_calc_stream=True) + + # Multi stream operation will be supported later + dist.wait(tensor=p, group=self.group, use_calc_stream=True) + def _generate_master_params(self, trainable_params): if self.offload: for param in trainable_params: diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index d884c416fa92c..e654f88f0b7b8 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -61,12 +61,10 @@ def __init__( sharding_optimizer, group=None, sync_buffers=False, - pertrain_sync_models=True, buffer_max_size=2**23, #8MB auto_refresh_trainable=True, device="gpu", - use_grad_storage=True, - accumulate_grads=False): + use_grad_storage=True): super().__init__() # training options @@ -81,9 +79,6 @@ def __init__( self._sync_buffers = sync_buffers self._auto_refresh_trainable = auto_refresh_trainable - # Gradient accumulation, Gradient flip - self._accumulate_grads = accumulate_grads - # Communication related attributes self._group = dist.new_group(_get_global_group() .ranks) if group is None else group @@ -128,16 +123,11 @@ def __init__( # Set backward pass hooks self._bw_hooks = [] - # Synchronous all ranks models - if pertrain_sync_models: - self._sync_params_and_buffers() - # Set tasks flow self._tasks_flow = deque() # Define optimizer step and clear_grad - if self._accumulate_grads: - self._redefine_opt_step() + self._redefine_opt_step() self._redefine_opt_clear() def forward(self, *inputs, **kwargs): @@ -313,9 +303,6 @@ def reduce(*_): # Change reduce information self._grad_reduced[index] = False - if not self._accumulate_grads: - param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) # Clear the gradient that does not belong to the current rank through the callback function def cleanup(): @@ -362,11 +349,6 @@ def reduce(*_): if grad_storage.all_checked_in: 
assert grad_storage.buffer is not None - # Normalize all ranks grad_storage - if not self._accumulate_grads: - grad_storage.buffer.scale_( - scale=self._world_size_scaling) - # Clearing up the grad_storage buffer def cleanup(): if dst_rank != self._rank: @@ -432,22 +414,6 @@ def _setup_backward_hooks(self): self._bw_hooks.append( param._register_backward_hook(reduce_function)) - @paddle.no_grad() - def _sync_params_and_buffers(self): - """ - Sync all model states for all ranks - """ - - for t in self._layer.parameters(): - dist.broadcast( - t, - src=self._global_root_rank, - group=self._group, - use_calc_stream=True) - - # Multi stream operation will be supported later - dist.wait(tensor=t, group=self._group, use_calc_stream=True) - def _setup_use_grad_storage(self): """ Integrate the parameters gradient into a continuous memory according to rank, and support the update of training parameters. @@ -555,8 +521,6 @@ def _rank_buffer_size(self, buffer_max_size, model_size): return rank_buffer_size def _redefine_opt_step(self): - if not self._accumulate_grads: - return grad_func = self._grad_scale for opt in self._sharding_optimizers: opt_step = opt.step diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 00c72e28a6ffd..9f9811b9eb0fc 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -72,7 +72,6 @@ def __init__(self, device="gpu", segment_size=2**15, pertrain_sync_models=True, - accumulate_grads=False, offload=False, sync_comm=False): super().__init__() @@ -82,7 +81,6 @@ def __init__(self, self._layer = layer self._default_device = device self.__sync_buffers = sync_buffers - self._accumulate_grads = accumulate_grads self._offload = offload self._sync_comm = sync_comm # segmentation size @@ -190,6 +188,7 @@ def _clear_gradients(self): param.fw_storage.clear_gradient(False) param.fw_storage._gradient_set_empty(False) param.bw_storage._clear() + param.bw_storage = None # 2.Handle unslice param if not self._offload: for grad_storage in self._grad_storages.values(): @@ -446,13 +445,12 @@ def _update_params(self): param, "fw_storage"), "Find {} don't have fw_storage attribute".format( param.name) - - if self._accumulate_grads: - if self._offload: - with device_guard(device="cpu"): - param.bw_storage.scale_(scale=self._world_size_scaling) - else: + # Gradient average + if self._offload: + with device_guard(device="cpu"): param.bw_storage.scale_(scale=self._world_size_scaling) + else: + param.bw_storage.scale_(scale=self._world_size_scaling) param.fw_storage = _VarBaseWrapper(param) assert param.fw_storage.grad is None param.fw_storage._copy_gradient_from(param.bw_storage) @@ -526,8 +524,6 @@ def _get_allreduce_fn(self, param): def reduce(*_): if param.name in self._task_flow.full_grad.keys(): full_grad = self._task_flow.full_grad[param.name] - if not self._accumulate_grads: - full_grad.scale_(scale=self._world_size_scaling) # Only support sync allreduce current rank's layer now dist.all_reduce( tensor=full_grad, group=self._group, use_calc_stream=True) @@ -535,8 +531,7 @@ def reduce(*_): tensor=full_grad, group=self._group, use_calc_stream=True) start, end = self._param2buffer[param.name][self._rank] - if not self._accumulate_grads or param.bw_storage is None or not param.bw_storage.value( - ).get_tensor()._is_initialized(): + if param.bw_storage is None: 
param.bw_storage = core.VarBase( full_grad._slice(start, end)).detach().clone() if self._offload: diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 80acf7217e76f..06935e212c3cb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -27,7 +27,7 @@ from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 -seed = 2021 +seed = 2022 epoch = 2 linear_size = 1000 @@ -105,11 +105,7 @@ def train_mlp(model, params=model.parameters(), optim=optimizer, group=group) model = ShardingStage2( - model, - optimizer, - group=group, - buffer_max_size=2**21, - accumulate_grads=batch_size == 20) + model, optimizer, group=group, buffer_max_size=2**21) else: optimizer = fleet.distributed_optimizer(optimizer) model = fleet.distributed_model(model) @@ -140,6 +136,8 @@ def train_mlp(model, loss = paddle.nn.functional.cross_entropy(input=out, label=label) avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if batch_size == 20: + avg_loss = avg_loss / 5 avg_loss.backward() if not accumulate_grad: @@ -166,6 +164,7 @@ def test_dp_stage2(): mlp4.set_state_dict(state_dict) mlp5.set_state_dict(state_dict) + # DP VS stage2 dp_params = train_mlp( mlp1, sharding_stage="dp", use_pure_fp16=False, opt_group=False) stage2_params = train_mlp( @@ -174,7 +173,8 @@ def test_dp_stage2(): np.testing.assert_allclose( dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6) - stage2_params = train_mlp(mlp3, sharding_stage=2) + # stage2 accumulate grad + stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True) stage2_accumulate_grad = train_mlp( mlp4, sharding_stage=2, batch_size=20, accumulate_grad=True) for i in range(len(stage2_params)): @@ -184,6 +184,7 @@ def test_dp_stage2(): rtol=1e-5, atol=1e-5) + # stage2 param list VS param group stage2_params = train_mlp( mlp2, sharding_stage=2, use_pure_fp16=False, opt_group=True) for i in range(len(dp_params)): diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py index 84ffe9094d812..39ba44815d940 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -43,13 +43,12 @@ def train_mlp(model, offload=False): optimizer = optimizer_setting(model=model, use_pure_fp16=True) model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') - scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) scaler = ShardingScaler(scaler) optimizer = ShardingOptimizerStage2( params=model.parameters(), optim=optimizer, offload=offload) - model = ShardingStage2( - model, optimizer, buffer_max_size=2**21, accumulate_grads=False) + model = ShardingStage2(model, optimizer, buffer_max_size=2**21) train_reader = paddle.batch( reader_decorator(linear_size), batch_size=batch_size, drop_last=True) diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py index 9bb1f85f327c3..6b755cf4c2b59 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py +++ 
b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3.py @@ -101,18 +101,10 @@ def train_mlp(model, optimizer = ShardingOptimizerStage2( params=model.parameters(), optim=optimizer, group=group) model = ShardingStage2( - model, - optimizer, - group=group, - buffer_max_size=2**21, - accumulate_grads=batch_size == 20) + model, optimizer, group=group, buffer_max_size=2**21) elif sharding_stage == 3: model = ShardingStage3( - model, - optimizer=optimizer, - group=group, - accumulate_grads=batch_size == 20, - sync_comm=recompute) + model, optimizer=optimizer, group=group, sync_comm=recompute) # check optimizer.minimize() error if test_minimize: @@ -231,7 +223,7 @@ def test_stage2_stage3(): stage2_params[i].numpy(), stage3_params[i].numpy(), rtol=1e-4, - atol=1e-4) + atol=1e-3) # fp16 recompute stage3_params = train_mlp( diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py index aa440549cf147..df7ba78d345a3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage3_offload.py @@ -91,11 +91,7 @@ def train_mlp(model, scaler = ShardingScaler(scaler) model = ShardingStage3( - model, - optimizer=optimizer, - group=group, - offload=offload, - accumulate_grads=accumulate_grad) + model, optimizer=optimizer, group=group, offload=offload) train_reader = paddle.batch( reader_decorator(), batch_size=batch_size, drop_last=True) From db43b541a46ebe8bb36dcbd80e0054a14c440b5a Mon Sep 17 00:00:00 2001 From: zyfncg Date: Thu, 17 Feb 2022 16:40:44 +0800 Subject: [PATCH 15/19] [Pten] Remove register of matmul_v2 kernel (#39542) * remove register of matmul_v2 kernel * delete matmul_v2 grad register in fluid --- .../performance_tests/benchmark_eager_cpu.cc | 2 +- .../performance_tests/benchmark_eager_cuda.cc | 2 +- .../performance_tests/benchmark_fluid_cpu.cc | 2 +- .../performance_tests/benchmark_fluid_cuda.cc | 2 +- .../eager/tests/task_tests/generated_test.cc | 2 +- paddle/fluid/operators/matmul_v2_op.cc | 34 ----- paddle/fluid/operators/matmul_v2_op.cu | 52 -------- paddle/fluid/operators/matmul_v2_op.h | 122 ------------------ 8 files changed, 5 insertions(+), 213 deletions(-) delete mode 100644 paddle/fluid/operators/matmul_v2_op.cu diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 8aa6b7b846074..ca7f0a61049e9 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -177,5 +177,5 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { USE_OP_ITSELF(scale); USE_OP_ITSELF(elementwise_add); -USE_OP(matmul_v2); +USE_OP_ITSELF(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 53d97b2919a5b..288d09787bd49 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -186,7 +186,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { } USE_OP_ITSELF(scale); -USE_OP(matmul_v2); +USE_OP_ITSELF(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc 
b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 0b2585905d3ed..3797dc92ded0e 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -213,5 +213,5 @@ TEST(Benchmark, FluidMLPCPU) { USE_OP_ITSELF(scale); USE_OP_ITSELF(elementwise_add); -USE_OP(matmul_v2); +USE_OP_ITSELF(matmul_v2); USE_OP(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 9cebb73a34a7f..7a449750a1c99 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -246,7 +246,7 @@ TEST(Benchmark, FluidMLPCUDA) { } // namespace paddle USE_OP_ITSELF(scale); -USE_OP(matmul_v2); +USE_OP_ITSELF(matmul_v2); USE_OP(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index e3bdba05e9736..0c2dd0e3a667c 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -124,4 +124,4 @@ TEST(Generated, ElementwiseAdd) { USE_OP(sigmoid); USE_OP_ITSELF(elementwise_add); -USE_OP(matmul_v2); +USE_OP_ITSELF(matmul_v2); diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 40f2b625f6500..375705e8db2b2 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -538,37 +538,3 @@ REGISTER_OPERATOR(matmul_v2_grad_grad, ops::MatMulV2OpDoubleGrad, ops::MatMulV2OpTripleGradMaker); REGISTER_OPERATOR(matmul_v2_triple_grad, ops::MatMulV2OpTripleGrad); - -REGISTER_OP_CPU_KERNEL( - matmul_v2, ops::MatMulV2Kernel, - ops::MatMulV2Kernel, - ops::MatMulV2Kernel>, - ops::MatMulV2Kernel>); - -REGISTER_OP_CPU_KERNEL( - matmul_v2_grad, - ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel>, - ops::MatMulV2GradKernel>); -REGISTER_OP_CPU_KERNEL( - matmul_v2_grad_grad, - ops::MatMulV2DoubleGradKernel, - ops::MatMulV2DoubleGradKernel, - ops::MatMulV2DoubleGradKernel>, - ops::MatMulV2DoubleGradKernel>); - -REGISTER_OP_CPU_KERNEL( - matmul_v2_triple_grad, - ops::MatMulV2TripleGradKernel, - ops::MatMulV2TripleGradKernel, - ops::MatMulV2TripleGradKernel>, - ops::MatMulV2TripleGradKernel>); diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu deleted file mode 100644 index c9602a1eab931..0000000000000 --- a/paddle/fluid/operators/matmul_v2_op.cu +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/matmul_v2_op.h" - -namespace ops = paddle::operators; -namespace plf = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - matmul_v2, ops::MatMulV2Kernel, - ops::MatMulV2Kernel, - ops::MatMulV2Kernel, - ops::MatMulV2Kernel>, - ops::MatMulV2Kernel>); - -REGISTER_OP_CUDA_KERNEL( - matmul_v2_grad, ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel>, - ops::MatMulV2GradKernel>); - -REGISTER_OP_CUDA_KERNEL( - matmul_v2_grad_grad, - ops::MatMulV2DoubleGradKernel, - ops::MatMulV2DoubleGradKernel, - ops::MatMulV2DoubleGradKernel, - ops::MatMulV2DoubleGradKernel>, - ops::MatMulV2DoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - matmul_v2_triple_grad, - ops::MatMulV2TripleGradKernel, - ops::MatMulV2TripleGradKernel, - ops::MatMulV2TripleGradKernel, - ops::MatMulV2TripleGradKernel>, - ops::MatMulV2TripleGradKernel>); diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index 6fac2d1038334..045f823b7b672 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -37,29 +37,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using framework::Tensor; - -template -class MatMulV2Kernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); - bool trans_x = ctx.Attr("trans_x"); - bool trans_y = ctx.Attr("trans_y"); - - auto& dev_ctx = ctx.device_context(); - Out->mutable_data(X->place()); - - // call new kernel - pten::MatmulKernel( - static_cast::TYPE&>(dev_ctx), - *X, *Y, trans_x, trans_y, Out); - } -}; - // Reshape a rank-3 tensor from P x M x N to (P * M) x N. // Identity op if the tensor is not of rank 3. 
static framework::Tensor FoldInitDims(const framework::Tensor& input) { @@ -133,104 +110,5 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x, ReshapeTensorIntoMatrixSequence(y, mat_dim_y); } -template -class MatMulV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - bool transpose_x = ctx.Attr("trans_x"); - bool transpose_y = ctx.Attr("trans_y"); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - if (dx) dx->mutable_data(ctx.GetPlace()); - if (dy) dy->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - - // call new kernel - pten::MatmulGradKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, *dout, transpose_x, transpose_y, dx, dy); - } -}; - -template -class MatMulV2DoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* dout = context.Input("DOut"); - auto* ddx = context.Input("DDX"); - auto* ddy = context.Input("DDY"); - - auto* dx = context.Output("DX"); - auto* dy = context.Output("DY"); - auto* ddout = context.Output("DDOut"); - - bool transpose_x = context.Attr("trans_x"); - bool transpose_y = context.Attr("trans_y"); - - if (dx) dx->mutable_data(context.GetPlace()); - if (dy) dy->mutable_data(context.GetPlace()); - if (ddout) ddout->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - - // call new kernel - pten::MatmulDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, *dout, *ddx, *ddy, transpose_x, transpose_y, dx, dy, ddout); - } -}; - -template -class MatMulV2TripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // get input - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* dout = context.Input("DOut"); - auto* ddx = context.Input("DDX"); - auto* ddy = context.Input("DDY"); - - auto* d_dx = context.Input("D_DX"); - auto* d_dy = context.Input("D_DY"); - auto* d_ddout = context.Input("D_DDOut"); - - // get output - auto* out_d_x = context.Output("D_X_out"); - auto* out_d_y = context.Output("D_Y_out"); - auto* out_d_dout = context.Output("D_DOut_out"); - - auto* out_d_ddx = context.Output("D_DDX_out"); - auto* out_d_ddy = context.Output("D_DDY_out"); - - bool transpose_x = context.Attr("trans_x"); - bool transpose_y = context.Attr("trans_y"); - - if (out_d_x) out_d_x->mutable_data(context.GetPlace()); - if (out_d_y) out_d_y->mutable_data(context.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(context.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(context.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - // call new kernel - pten::MatmulTripleGradKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, *dout, *ddx, *ddy, *d_dx, *d_dy, *d_ddout, transpose_x, - transpose_y, out_d_x, out_d_y, out_d_dout, out_d_ddx, out_d_ddy); - } -}; - } // namespace operators } // namespace paddle From c1c5c1fc66b2e32e523201ad587d6112059e6d3f Mon Sep 17 00:00:00 2001 From: wenbin Date: Thu, 17 Feb 2022 17:15:10 +0800 Subject: [PATCH 16/19] adaptive pool2d pass fix (#39600) * first commit * teller fix * bug fix * enable for pool2d only * fix 
global_pooling issue * pooling_type * fix test --- .../ir/adaptive_pool2d_convert_global_pass.cc | 13 +++++++- ...ptive_pool2d_convert_global_pass_tester.cc | 2 ++ paddle/fluid/inference/tensorrt/op_teller.cc | 7 +++++ ...ive_pool2d_convert_global_pass_autoscan.py | 31 ++++--------------- 4 files changed, 27 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc index c280b7c32ed21..7846016d7e7b2 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc @@ -72,7 +72,18 @@ void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const { for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (op->HasAttr("adaptive") && op->HasAttr("ksize")) { + if (op->Type() == "pool2d" && op->HasAttr("adaptive") && + op->HasAttr("ksize")) { + if (op->HasAttr("global_pooling")) { + bool global_pooling = + BOOST_GET_CONST(bool, op->GetAttr("global_pooling")); + if (global_pooling) return; + } + if (!op->HasAttr("pooling_type")) return; + std::string type = + BOOST_GET_CONST(std::string, op->GetAttr("pooling_type")); + // adaptive has no effect on max pooling + if (type == "max") return; bool adaptive = BOOST_GET_CONST(bool, op->GetAttr("adaptive")); std::vector ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc index 19b0c5ca7fc2b..8870b68fbc5c5 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass_tester.cc @@ -29,6 +29,8 @@ TEST(AdaptivePool2dConvertGlobalPass, basic) { AttributeMap attrs; attrs["adaptive"] = true; attrs["ksize"] = std::vector{1, 1}; + attrs["pooling_type"] = + std::string("avg"); // adaptive has no effect on max pooling layers.pool2d(x, false, &attrs); std::unique_ptr graph(new ir::Graph(layers.main_program())); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 799c6c55bb121..436c80d9a6bcf 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -225,6 +225,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Out").size(); return false; } + if (desc.HasAttr("data_format")) { + std::string data_format = + BOOST_GET_CONST(std::string, desc.GetAttr("data_format")); + if (data_format == "NHWC" || data_format == "NDHWC") { + return false; + } + } if (!desc.HasAttr("pooling_type")) { return false; } else { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py index 96c2a175208fa..a8c3009a5aea1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py @@ -42,10 +42,14 @@ def sample_program_config(self, draw): st.integers( min_value=1, max_value=4), min_size=2, max_size=2)) - paddings = [0, 0] # only 0 0 is right + paddings = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, 
max_size=2)) + ceil_mode = draw(st.booleans()) exclusive = draw(st.booleans()) - global_pooling = False #only false is right + global_pooling = draw(st.booleans()) padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VAILD"])) pool_op = OpConfig( @@ -83,29 +87,6 @@ def sample_predictor_configs(self, program_config): use_calib_mode=False) yield config, ['pool2d'], (1e-5, 1e-5) - def add_ignore_pass_case(self): - # Here we put some skip rules to avoid known bugs - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs["pooling_type"] == "max": - x_shape = list(program_config.inputs["input_data"].shape) - if x_shape[-1] != 1 or x_shape[-2] != 1: - return True - return False - - def teller2(program_config, predictor_config): - if program_config.ops[0].attrs["padding_algorithm"] == "SAME": - return True - return False - - self.add_ignore_check_case( - teller1, - IgnoreReasons.PASS_ACCURACY_ERROR, - "max pooling has diff if H or W is not equals to 1", ) - self.add_ignore_check_case( - teller2, - IgnoreReasons.PASS_ACCURACY_ERROR, - "output has wrong result if padding_algorithm equals to SAME", ) - def test(self): self.run_and_statis( quant=False, From 9f99b591f45f66806cd4f1ada87ff1cefce4f2a2 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:17:14 +0800 Subject: [PATCH 17/19] add softplus op for kunlun2. test=kunlun (#39555) * add softplus op for kunlun2. test=kunlun * add softplus op for kunlun2. test=kunlun * fix code style. test=kunlun * fix code style. test=kunlun * add more test cases. test=kunlun --- cmake/external/xpu.cmake | 2 +- paddle/fluid/operators/activation_op_xpu.cc | 50 ++++++++++++++++- .../fluid/platform/device/xpu/xpu2_op_list.h | 3 ++ .../unittests/xpu/test_activation_op_xpu.py | 53 +++++++++++++++++++ 4 files changed, 106 insertions(+), 2 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index c86748dc5e81b..5e60f1f2b99fe 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220119") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc index 60188ee53ef07..62fb98b63a837 100644 --- a/paddle/fluid/operators/activation_op_xpu.cc +++ b/paddle/fluid/operators/activation_op_xpu.cc @@ -14,8 +14,10 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/activation_op.h" #include + +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { @@ -364,6 +366,50 @@ struct XPUPowFunctor : public BaseActivationFunctor { } }; +template +struct XPUSoftPlusFunctor : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Out"); + const T *x_data = x->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + + float beta = ctx.Attr("beta"); + float threshold = ctx.Attr("threshold"); + + auto xpu_context = + ctx.device_context().x_context(); + int r = + xpu::softplus(xpu_context, x_data, y_data, x->numel(), beta, threshold); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus"); + } +}; + +template +struct XPUSoftPlusGradFunctor : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + const auto *x = ctx.Input("X"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + auto *dX = ctx.Output(framework::GradVarName("X")); + const T *x_data = x->data(); + const T *y_grad = dOut->data(); + T *x_grad = dX->mutable_data(ctx.GetPlace()); + + float beta = ctx.Attr("beta"); + float threshold = ctx.Attr("threshold"); + + auto xpu_context = + ctx.device_context().x_context(); + int r = xpu::softplus_grad( + xpu_context, reinterpret_cast(x_data), + reinterpret_cast( + x_data), // softplus_grad do not need y_data + reinterpret_cast(y_grad), + reinterpret_cast(x_grad), dX->numel(), beta, threshold); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "softplus_grad"); + } +}; + } // namespace operators } // namespace paddle @@ -388,6 +434,8 @@ REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor, XPUSigmoidGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor) REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSquareGradFunctor) +REGISTER_ACTIVATION_XPU_KERNEL(softplus, XPUSoftPlusFunctor, + XPUSoftPlusGradFunctor) REGISTER_OP_XPU_KERNEL( tanh, ops::XPUActivationKernel>, diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index d73d6f0b816b1..6e7c98dd7156c 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -317,6 +317,9 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::FP16, XPUPlace())})}, {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softplus", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softplus_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"split", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace())})}, {"squeeze2_grad", diff --git a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py index ce82b20eca42d..57af5739f5dbe 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_activation_op_xpu.py @@ -358,6 +358,59 @@ def test_check_grad(self): self.check_grad_with_place(place, ['X'], 'Out') +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftPlus(TestXPUActivation): + def setUp(self): + self.op_type = "softplus" + self.init_dtype() + self.init_config() + 
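The added lines that follow draw beta and threshold and compare against ref_softplus, defined at the bottom of this test file: out = log(1 + exp(beta * x)) / beta while beta * x <= threshold, and out = x past it, with gradient sigmoid(beta * x) in the log branch. A self-contained C++ sketch of that reference math (the helper names are illustrative only, not Paddle or XDNN API):

#include <cmath>
#include <cstdio>

// Reference softplus: log branch below the threshold, linear branch above it
// (the linear branch avoids overflow in exp for large beta * x).
float SoftplusRef(float x, float beta, float threshold) {
  const float x_beta = beta * x;
  return x_beta > threshold ? x : std::log1p(std::exp(x_beta)) / beta;
}

// Gradient wrt x: sigmoid(beta * x) in the log branch, 1 in the linear branch.
float SoftplusGradRef(float x, float dout, float beta, float threshold) {
  const float x_beta = beta * x;
  return x_beta > threshold ? dout : dout / (1.0f + std::exp(-x_beta));
}

int main() {
  std::printf("%f\n", SoftplusRef(0.5f, 1.0f, 20.0f));            // ~0.974077
  std::printf("%f\n", SoftplusGradRef(0.5f, 1.0f, 1.0f, 20.0f));  // ~0.622459
  return 0;
}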
+ beta = np.random.uniform(0, 1) + threshold = np.random.uniform(0, 1) + out = ref_softplus(self.x, beta, threshold) + + self.inputs = {'X': self.x} + self.outputs = {'Out': out} + self.attrs = {'use_xpu': True, 'beta': beta, 'threshold': threshold} + + def init_config(self): + self.x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) + + def test_check_grad(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftPlus2(TestXPUSoftPlus): + def init_config(self): + self.x = np.random.uniform(-2, 2, [1024, 8]).astype(self.dtype) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftPlus3(TestXPUSoftPlus): + def init_config(self): + self.x = np.random.uniform(-2, 2, [4, 512, 15, 15]).astype(self.dtype) + + +@unittest.skipIf(not paddle.is_compiled_with_xpu(), + "core is not compiled with XPU") +class TestXPUSoftPlus4(TestXPUSoftPlus): + def init_config(self): + self.x = np.random.uniform(-2, 2, [4, 256, 22, 22]).astype(self.dtype) + + +def ref_softplus(x, beta=1, threshold=20): + x_beta = beta * x + out = np.select([x_beta <= threshold, x_beta > threshold], + [np.log(1 + np.exp(x_beta)) / beta, x]) + return out + + if __name__ == "__main__": paddle.enable_static() unittest.main() From c05cd7edae8cbe221c5e5d6e28cb12068a77f697 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 17 Feb 2022 17:22:47 +0800 Subject: [PATCH 18/19] [PTen] Clean useless header in pten core (#39560) * clean useless header in pten core * fix compiled failed * fix cmake target * fix typo * resolve conflict --- paddle/pten/api/lib/tensor.cc | 57 ++++--- paddle/pten/core/compat/convert_utils.h | 5 - paddle/pten/core/dense_tensor.cc | 8 +- paddle/pten/core/dense_tensor.h | 3 - paddle/pten/core/dense_tensor_impl.cc | 92 ++++++----- paddle/pten/core/kernel_utils.h | 2 +- paddle/pten/core/utils/data_type.h | 34 ++-- paddle/pten/kernels/CMakeLists.txt | 10 +- paddle/pten/kernels/complex_kernel.h | 24 +-- paddle/pten/kernels/cpu/cast_kernel.cc | 8 +- paddle/pten/kernels/cpu/complex_kernel.cc | 4 +- paddle/pten/kernels/cpu/concat_kernel.cc | 6 +- paddle/pten/kernels/cpu/dot_grad_kernel.cc | 4 +- paddle/pten/kernels/cpu/dot_kernel.cc | 4 +- .../kernels/cpu/elementwise_grad_kernel.cc | 20 +-- paddle/pten/kernels/cpu/full_kernel.cc | 10 +- paddle/pten/kernels/cpu/math_kernel.cc | 8 +- paddle/pten/kernels/cpu/matmul_grad_kernel.cc | 12 +- paddle/pten/kernels/cpu/matmul_kernel.cc | 4 +- paddle/pten/kernels/cpu/reduce.h | 8 +- paddle/pten/kernels/empty_kernel.cc | 30 ++-- paddle/pten/kernels/flatten_grad_kernel.cc | 4 +- paddle/pten/kernels/flatten_kernel.cc | 8 +- paddle/pten/kernels/funcs/CMakeLists.txt | 14 -- paddle/pten/kernels/funcs/common_shape.h | 2 + paddle/pten/kernels/funcs/math_function.cc | 119 +++++++------- paddle/pten/kernels/funcs/math_function.cu | 146 +++++++++--------- paddle/pten/kernels/funcs/transpose.cc | 76 --------- paddle/pten/kernels/funcs/transpose.cu | 124 --------------- paddle/pten/kernels/funcs/transpose.h | 62 -------- paddle/pten/kernels/gpu/cast_kernel.cu | 8 +- paddle/pten/kernels/gpu/complex_kernel.cu | 6 +- paddle/pten/kernels/gpu/concat_kernel.cu | 8 +- paddle/pten/kernels/gpu/dot_grad_kernel.cu | 4 +- paddle/pten/kernels/gpu/dot_kernel.cu | 4 +- .../kernels/gpu/elementwise_grad_kernel.cu | 30 ++-- paddle/pten/kernels/gpu/expand_grad_kernel.cu | 2 +- 
paddle/pten/kernels/gpu/expand_kernel.cu | 2 +- paddle/pten/kernels/gpu/full_kernel.cu | 8 +- paddle/pten/kernels/gpu/math_kernel.cu | 6 +- paddle/pten/kernels/gpu/matmul_grad_kernel.cu | 20 +-- paddle/pten/kernels/gpu/matmul_kernel.cu | 8 +- paddle/pten/kernels/gpu/norm_grad_kernel.cu | 2 +- paddle/pten/kernels/gpu/norm_kernel.cu | 2 +- paddle/pten/kernels/gpu/reduce.h | 36 ++--- paddle/pten/kernels/gpu/sign_kernel.cu | 2 +- paddle/pten/kernels/impl/full_kernel_impl.h | 7 +- .../kernels/impl/matmul_grad_kernel_impl.h | 2 +- paddle/pten/kernels/math_kernel.cc | 18 +-- paddle/pten/kernels/transfer_layout_kernel.cc | 4 +- paddle/pten/tests/api/scale_api.h | 2 +- paddle/pten/tests/kernels/CMakeLists.txt | 8 + .../kernels/test_math_function.cc} | 9 +- .../kernels/test_math_function.cu} | 109 ++++++------- 54 files changed, 481 insertions(+), 734 deletions(-) delete mode 100644 paddle/pten/kernels/funcs/transpose.cc delete mode 100644 paddle/pten/kernels/funcs/transpose.cu delete mode 100644 paddle/pten/kernels/funcs/transpose.h rename paddle/pten/{kernels/funcs/math_function_test.cc => tests/kernels/test_math_function.cc} (99%) rename paddle/pten/{kernels/funcs/math_function_test.cu => tests/kernels/test_math_function.cu} (85%) diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc index 40f35896323b9..aae11294b0be0 100644 --- a/paddle/pten/api/lib/tensor.cc +++ b/paddle/pten/api/lib/tensor.cc @@ -176,12 +176,12 @@ template PADDLE_API uint8_t *Tensor::mutable_data(); template PADDLE_API int8_t *Tensor::mutable_data(); template PADDLE_API int16_t *Tensor::mutable_data(); template PADDLE_API bool *Tensor::mutable_data(); -template PADDLE_API paddle::platform::complex - *Tensor::mutable_data>(); -template PADDLE_API paddle::platform::complex - *Tensor::mutable_data>(); -template PADDLE_API paddle::platform::float16 * -Tensor::mutable_data(); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(); +template PADDLE_API pten::dtype::float16 * +Tensor::mutable_data(); template T *Tensor::mutable_data(const PlaceType &place) { @@ -214,12 +214,12 @@ template PADDLE_API int8_t *Tensor::mutable_data( template PADDLE_API int16_t *Tensor::mutable_data( const PlaceType &place); template PADDLE_API bool *Tensor::mutable_data(const PlaceType &place); -template PADDLE_API paddle::platform::complex * -Tensor::mutable_data>(const PlaceType &place); -template PADDLE_API paddle::platform::complex * -Tensor::mutable_data>(const PlaceType &place); -template PADDLE_API paddle::platform::float16 * -Tensor::mutable_data(const PlaceType &place); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(const PlaceType &place); +template PADDLE_API pten::dtype::complex + *Tensor::mutable_data>(const PlaceType &place); +template PADDLE_API pten::dtype::float16 * +Tensor::mutable_data(const PlaceType &place); template const T *Tensor::data() const { @@ -241,14 +241,14 @@ template PADDLE_API const uint8_t *Tensor::data() const; template PADDLE_API const int8_t *Tensor::data() const; template PADDLE_API const int16_t *Tensor::data() const; template PADDLE_API const bool *Tensor::data() const; -template PADDLE_API const paddle::platform::complex - *Tensor::data>() const; -template PADDLE_API const paddle::platform::complex - *Tensor::data>() const; -template PADDLE_API const paddle::platform::float16 * -Tensor::data() const; -template PADDLE_API const paddle::platform::bfloat16 * -Tensor::data() const; +template 
PADDLE_API const pten::dtype::complex + *Tensor::data>() const; +template PADDLE_API const pten::dtype::complex + *Tensor::data>() const; +template PADDLE_API const pten::dtype::float16 * +Tensor::data() const; +template PADDLE_API const pten::dtype::bfloat16 * +Tensor::data() const; template T *Tensor::data() { @@ -267,12 +267,11 @@ template PADDLE_API uint8_t *Tensor::data(); template PADDLE_API int8_t *Tensor::data(); template PADDLE_API int16_t *Tensor::data(); template PADDLE_API bool *Tensor::data(); -template PADDLE_API paddle::platform::complex - *Tensor::data>(); -template PADDLE_API paddle::platform::complex - *Tensor::data>(); -template PADDLE_API paddle::platform::float16 * -Tensor::data(); +template PADDLE_API pten::dtype::complex + *Tensor::data>(); +template PADDLE_API pten::dtype::complex + *Tensor::data>(); +template PADDLE_API pten::dtype::float16 *Tensor::data(); // TODO(chenweihang): replace slice impl by API Tensor Tensor::slice(int64_t begin_idx, int64_t end_idx) const { @@ -328,12 +327,12 @@ template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; template PADDLE_API Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( +template PADDLE_API Tensor Tensor::copy_to>( const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( +template PADDLE_API Tensor Tensor::copy_to>( const PlaceType &target_place) const; template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; +Tensor::copy_to(const PlaceType &target_place) const; Tensor Tensor::copy_to(Backend backend, bool blocking) const { return experimental::copy_to(*this, backend, blocking); diff --git a/paddle/pten/core/compat/convert_utils.h b/paddle/pten/core/compat/convert_utils.h index 0db71b577de51..fba2243808a97 100644 --- a/paddle/pten/core/compat/convert_utils.h +++ b/paddle/pten/core/compat/convert_utils.h @@ -20,11 +20,6 @@ limitations under the License. */ #include "paddle/pten/common/place.h" #include "paddle/pten/core/tensor_meta.h" -// See Note [ Why still include the fluid headers? 
] -#include "paddle/fluid/framework/data_type.h" - -// TODO(chenweihang): this file may need to be removed - namespace pten { std::string TransToPtenKernelName(const std::string& fluid_op_name); diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc index 36d56212e216a..82150e10bb313 100644 --- a/paddle/pten/core/dense_tensor.cc +++ b/paddle/pten/core/dense_tensor.cc @@ -202,12 +202,12 @@ DATA_MEMBER_FUNC_INSTANTIATION(int32_t); DATA_MEMBER_FUNC_INSTANTIATION(uint32_t); DATA_MEMBER_FUNC_INSTANTIATION(int64_t); DATA_MEMBER_FUNC_INSTANTIATION(uint64_t); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::bfloat16); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::float16); DATA_MEMBER_FUNC_INSTANTIATION(float); DATA_MEMBER_FUNC_INSTANTIATION(double); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64); -DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex); +DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex); #undef DATA_MEMBER_FUNC_INSTANTIATION diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h index 44232930e5f0d..280af941403a1 100644 --- a/paddle/pten/core/dense_tensor.h +++ b/paddle/pten/core/dense_tensor.h @@ -20,9 +20,6 @@ limitations under the License. */ #include "paddle/pten/core/tensor_base.h" #include "paddle/pten/core/tensor_meta.h" -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/data_type.h" - /* @jim19930609: Move to MKLDNN_Tensor in the future */ #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/pten/core/dense_tensor_impl.cc b/paddle/pten/core/dense_tensor_impl.cc index dfde62618d01c..7237f03dccf96 100644 --- a/paddle/pten/core/dense_tensor_impl.cc +++ b/paddle/pten/core/dense_tensor_impl.cc @@ -40,14 +40,14 @@ size_t DenseTensor::memory_size() const { } void DenseTensor::check_memory_size() const { - PADDLE_ENFORCE_NOT_NULL(holder_, - paddle::platform::errors::PreconditionNotMet( - "Tensor holds no memory. " - "Call Tensor::mutable_data firstly.")); + PADDLE_ENFORCE_NOT_NULL( + holder_, + pten::errors::PreconditionNotMet("Tensor holds no memory. " + "Call Tensor::mutable_data firstly.")); PADDLE_ENFORCE_LE( numel() * SizeOf(dtype()), memory_size(), - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "Tensor's dimension is out of bound." "Tensor's dimension must be equal or less than the size of its " "memory." 
@@ -56,10 +56,10 @@ void DenseTensor::check_memory_size() const { memory_size())); } -const paddle::platform::Place& DenseTensor::place() const { +const Place& DenseTensor::place() const { PADDLE_ENFORCE_NOT_NULL( holder_, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "Tensor not initialized yet when DenseTensor::place() is called.")); return holder_->place(); } @@ -82,7 +82,7 @@ void DenseTensor::ResetHolder(const std::shared_ptr& holder) { numel() * static_cast(SizeOf(dtype())) + static_cast(meta_.offset), static_cast(holder->size()), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The size of Holder is not enough to store the Tensor.")); } holder_ = holder; @@ -99,14 +99,14 @@ void DenseTensor::set_type(paddle::experimental::DataType type) { meta_.dtype = type; } -void* DenseTensor::mutable_data(const paddle::platform::Place& place, +void* DenseTensor::mutable_data(const Place& place, paddle::experimental::DataType type, size_t requested_size) { set_type(type); PADDLE_ENFORCE_GE( numel(), 0, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The Tensor's element number must be equal or greater than zero. " "The Tensor's shape is [", dims(), @@ -127,19 +127,18 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, meta_.offset); } -void* DenseTensor::mutable_data(const paddle::platform::Place& place, - size_t requested_size) { +void* DenseTensor::mutable_data(const Place& place, size_t requested_size) { return mutable_data(place, type(), requested_size); } -void* DenseTensor::mutable_data(const paddle::platform::Place& place, +void* DenseTensor::mutable_data(const Place& place, paddle::experimental::DataType type, const pten::Stream& stream) { set_type(type); PADDLE_ENFORCE_GE( numel(), 0, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "The Tensor's element number must be equal or greater than zero. 
" "The Tensor's shape is [", dims(), @@ -149,7 +148,7 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + meta_.offset || - !(paddle::platform::is_gpu_place(place) && + !(place.GetType() == pten::AllocationType::GPU && paddle::memory::InSameStream(holder_, stream))) { holder_.reset(); holder_ = paddle::memory::AllocShared(place, size, stream); @@ -166,7 +165,7 @@ void* DenseTensor::mutable_data(const paddle::platform::Place& place, */ template inline T* DenseTensor::mutable_data(const DDim& dims, - const paddle::platform::Place& place, + const Place& place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); meta_.dims = dims; @@ -174,8 +173,7 @@ inline T* DenseTensor::mutable_data(const DDim& dims, } template -inline T* DenseTensor::mutable_data(const paddle::platform::Place& place, - size_t requested_size) { +inline T* DenseTensor::mutable_data(const Place& place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); return reinterpret_cast( mutable_data(place, @@ -189,13 +187,11 @@ void DenseTensor::ShareBufferWith(const DenseTensor& tensor) { meta_.dtype = tensor.dtype(); } -#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ - template dtype* DenseTensor::mutable_data( \ - const DDim& dims, \ - const paddle::platform::Place& place, \ - size_t requested_size); \ - template dtype* DenseTensor::mutable_data( \ - const paddle::platform::Place& place, size_t requested_size); +#define LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(dtype) \ + template dtype* DenseTensor::mutable_data( \ + const DDim& dims, const Place& place, size_t requested_size); \ + template dtype* DenseTensor::mutable_data(const Place& place, \ + size_t requested_size); LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(bool) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int8_t) @@ -205,10 +201,10 @@ LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int32_t) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(int64_t) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(float) LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(double) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64) -LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::bfloat16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::float16) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex) +LEGACY_DATA_MEMBER_FUNC_INSTANTIATION(::pten::dtype::complex) #undef LEGACY_DATA_MEMBER_FUNC_INSTANTIATION @@ -234,7 +230,7 @@ std::pair DenseTensor::lod_element(size_t level, PADDLE_ENFORCE_LT( level, NumLevels(), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The input level of LoD is invalid, it should be less than LoD " "size. The input level is %zu, the LoD size is %zu.", level, @@ -242,7 +238,7 @@ std::pair DenseTensor::lod_element(size_t level, PADDLE_ENFORCE_LT(elem, NumElements(level), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The input element of LoD is invalid, it should be " "less than the number of elements in its level." 
"The input element is %zu, the number of elements in " @@ -259,7 +255,7 @@ size_t DenseTensor::NumElements(size_t level) const { PADDLE_ENFORCE_LT( level, NumLevels(), - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The input level of LoD is invalid, it should be less than LoD " "size. The input level is %zu, the LoD size is %zu.", level, @@ -276,20 +272,20 @@ DenseTensor& DenseTensor::Resize(const DDim& dims) { DenseTensor DenseTensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); - PADDLE_ENFORCE_GE(begin_idx, - 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, - meta_.dims[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); + PADDLE_ENFORCE_GE( + begin_idx, + 0, + pten::errors::OutOfRange("The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, + meta_.dims[0], + pten::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( + pten::errors::InvalidArgument( "The start row index must be less than the end row index." "But received the start index = %d, the end index = %d.", begin_idx, @@ -317,13 +313,13 @@ std::vector DenseTensor::Split(int64_t split_size, PADDLE_ENFORCE_GE(meta_.dims.size(), 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); PADDLE_ENFORCE_GE( split_size, 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "split expects split_size be non-negative, but got split_size is %d", split_size)); @@ -350,12 +346,12 @@ std::vector DenseTensor::Chunk(int64_t chunks, check_memory_size(); PADDLE_ENFORCE_GE(meta_.dims.size(), 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "split expects at least a 1-dimensional tensor")); PADDLE_ENFORCE_GE( chunks, 0, - paddle::platform::errors::OutOfRange( + pten::errors::OutOfRange( "chunks expects to be greater than 0, but got chunks is %d", chunks)); int64_t numel_size = meta_.dims[axis]; @@ -376,7 +372,7 @@ DenseTensor& DenseTensor::ShareInplaceVersionCounterWith( const DenseTensor& src) { PADDLE_ENFORCE_NOT_NULL( inplace_version_counter_, - paddle::platform::errors::PreconditionNotMet( + pten::errors::PreconditionNotMet( "Tensor does not hold inplace_version_counter_.")); inplace_version_counter_ = src.inplace_version_counter_; diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h index 8bc125c50bed6..01632b7e58672 100644 --- a/paddle/pten/core/kernel_utils.h +++ b/paddle/pten/core/kernel_utils.h @@ -233,7 +233,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); - PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(pten::dtype::float16); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); diff --git a/paddle/pten/core/utils/data_type.h b/paddle/pten/core/utils/data_type.h index ca0c678e0623d..924040bf890f6 100644 --- a/paddle/pten/core/utils/data_type.h +++ b/paddle/pten/core/utils/data_type.h @@ -26,23 +26,23 @@ 
namespace pten { #define _PtenForEachDataTypeHelper_(callback, cpp_type, data_type) \ callback(cpp_type, data_type); -#define _PtenForEachDataType_(callback) \ - _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::float16, DataType::FLOAT16); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::bfloat16, DataType::BFLOAT16); \ - _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ - _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ - _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ - _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ - _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ - _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ - _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::complex, DataType::COMPLEX64); \ - _PtenForEachDataTypeHelper_( \ - callback, ::paddle::platform::complex, DataType::COMPLEX128); +#define _PtenForEachDataType_(callback) \ + _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::float16, DataType::FLOAT16); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::bfloat16, DataType::BFLOAT16); \ + _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ + _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ + _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ + _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ + _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ + _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::complex, DataType::COMPLEX64); \ + _PtenForEachDataTypeHelper_( \ + callback, ::pten::dtype::complex, DataType::COMPLEX128); template inline void VisitDataType(pten::DataType type, Visitor visitor) { diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index a9b81ad4eb2b3..20067eb5b0470 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -15,14 +15,10 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) -set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel pten_transpose_cpu) -if(WITH_GPU OR WITH_ROCM) - set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_gpu) -endif() - # auto build kernel targets by cmake -register_kernels(EXCLUDES math_kernel DEPS ${COMMON_KERNEL_DEPS}) -kernel_library(math_kernel DEPS ${MATH_KERNEL_DEPS}) +register_kernels(DEPS ${COMMON_KERNEL_DEPS}) + +# pten sparse kernels add_subdirectory(sparse) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/pten/kernels/complex_kernel.h b/paddle/pten/kernels/complex_kernel.h index ab1cb59872a04..867af865fe0fd 100644 --- a/paddle/pten/kernels/complex_kernel.h +++ b/paddle/pten/kernels/complex_kernel.h @@ -25,12 +25,12 @@ template void ConjKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); // If T is complex -template >::value || - std::is_same>::value, - bool> = true> +template < + typename T, + typename Context, + std::enable_if_t>::value || + 
std::is_same>::value, + bool> = true> DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { auto dense_out = pten::Empty(dev_ctx); MetaTensor meta_out(&dense_out); @@ -40,12 +40,12 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { } // If T is not complex -template >::value && - !std::is_same>::value, - bool> = true> +template < + typename T, + typename Context, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc index 24371ca7690de..7303028cd7a16 100644 --- a/paddle/pten/kernels/cpu/cast_kernel.cc +++ b/paddle/pten/kernels/cpu/cast_kernel.cc @@ -69,9 +69,9 @@ PT_REGISTER_KERNEL(cast, int16_t, bool, uint8_t, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) { + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } diff --git a/paddle/pten/kernels/cpu/complex_kernel.cc b/paddle/pten/kernels/cpu/complex_kernel.cc index 6cdba15620fe3..e1bbe1ff00ed0 100644 --- a/paddle/pten/kernels/cpu/complex_kernel.cc +++ b/paddle/pten/kernels/cpu/complex_kernel.cc @@ -25,8 +25,8 @@ PT_REGISTER_KERNEL(conj, CPU, ALL_LAYOUT, pten::ConjKernel, - paddle::platform::complex, - paddle::platform::complex, + pten::dtype::complex, + pten::dtype::complex, float, double, int, diff --git a/paddle/pten/kernels/cpu/concat_kernel.cc b/paddle/pten/kernels/cpu/concat_kernel.cc index c4aed7679bd72..a9ecd15e68d87 100644 --- a/paddle/pten/kernels/cpu/concat_kernel.cc +++ b/paddle/pten/kernels/cpu/concat_kernel.cc @@ -120,6 +120,6 @@ PT_REGISTER_KERNEL(concat, int64_t, int, uint8_t, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/dot_grad_kernel.cc b/paddle/pten/kernels/cpu/dot_grad_kernel.cc index 91202cf836df5..2705c0667941c 100644 --- a/paddle/pten/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_grad_kernel.cc @@ -28,5 +28,5 @@ PT_REGISTER_KERNEL(dot_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc index 5cef8d0bdd56d..5166d9c061f95 100644 --- a/paddle/pten/kernels/cpu/dot_kernel.cc +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -46,8 +46,8 @@ void DotKernel(const Context& dev_ctx, } // namespace pten -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; PT_REGISTER_KERNEL(dot, CPU, diff --git a/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc index d3d3aa79edb39..002b575341a16 100644 --- a/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/elementwise_grad_kernel.cc @@ -134,8 +134,8 @@ PT_REGISTER_KERNEL(add_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(add_double_grad, CPU, @@ -145,8 +145,8 @@ PT_REGISTER_KERNEL(add_double_grad, double, int, int64_t, - 
paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(add_triple_grad, CPU, @@ -156,8 +156,8 @@ PT_REGISTER_KERNEL(add_triple_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(subtract_grad, CPU, @@ -167,8 +167,8 @@ PT_REGISTER_KERNEL(subtract_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(subtract_double_grad, CPU, @@ -178,5 +178,5 @@ PT_REGISTER_KERNEL(subtract_double_grad, double, int, int64_t, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/full_kernel.cc b/paddle/pten/kernels/cpu/full_kernel.cc index 919471d86ac53..62e1bbf1d9d9c 100644 --- a/paddle/pten/kernels/cpu/full_kernel.cc +++ b/paddle/pten/kernels/cpu/full_kernel.cc @@ -29,10 +29,10 @@ PT_REGISTER_KERNEL(full, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(full_like, CPU, @@ -43,4 +43,4 @@ PT_REGISTER_KERNEL(full_like, int, int64_t, bool, - paddle::platform::float16) {} + pten::dtype::float16) {} diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index d4987e7a36069..70e90587123fe 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -113,11 +113,11 @@ DEFINE_CPU_ELEMENTWISE_OP(Multiply) } // namespace pten -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; +using complex64 = ::pten::dtype::complex; +using complex128 = ::pten::dtype::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 -// using bfloat16 = ::paddle::platform::bfloat16; +// using bfloat16 = ::pten::dtype::bfloat16; PT_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, @@ -166,7 +166,7 @@ PT_REGISTER_KERNEL(sum_raw, bool, float, double, - paddle::platform::float16, + pten::dtype::float16, int, int64_t, complex64, diff --git a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc index 955f2b017b0dc..fa0fd0c8d4c0a 100644 --- a/paddle/pten/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_grad_kernel.cc @@ -25,8 +25,8 @@ PT_REGISTER_KERNEL(matmul_grad, pten::MatmulGradKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(matmul_double_grad, CPU, @@ -34,8 +34,8 @@ PT_REGISTER_KERNEL(matmul_double_grad, pten::MatmulDoubleGradKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(matmul_triple_grad, CPU, @@ -43,5 +43,5 @@ PT_REGISTER_KERNEL(matmul_triple_grad, pten::MatmulTripleGradKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/matmul_kernel.cc b/paddle/pten/kernels/cpu/matmul_kernel.cc index 51def07d4031f..46e2c49ab115b 100644 --- a/paddle/pten/kernels/cpu/matmul_kernel.cc +++ b/paddle/pten/kernels/cpu/matmul_kernel.cc @@ -26,5 +26,5 @@ 
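The registration hunks above and below only rename the complex dtypes; the macro itself instantiates and registers the templated kernel once per listed type. Roughly, and only as an illustration (a hypothetical registry, not the real PT_REGISTER_KERNEL machinery):

#include <complex>
#include <cstdio>
#include <functional>
#include <initializer_list>
#include <map>
#include <string>
#include <typeinfo>

// Hypothetical registry keyed by "kernel/dtype"; illustrative only.
std::map<std::string, std::function<void()>>& Registry() {
  static std::map<std::string, std::function<void()>> registry;
  return registry;
}

template <typename T>
void MatmulKernelStub() {
  std::printf("matmul instantiated for %s\n", typeid(T).name());
}

// One registration per dtype in the trailing list, loosely what a
// variadic registration macro expands to.
template <typename... Ts>
void RegisterMatmul() {
  (void)std::initializer_list<int>{
      (Registry()[std::string("matmul/") + typeid(Ts).name()] =
           &MatmulKernelStub<Ts>,
       0)...};
}

int main() {
  RegisterMatmul<float, double, std::complex<float>, std::complex<double>>();
  for (auto& kv : Registry()) kv.second();  // run every registered stub
  return 0;
}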
PT_REGISTER_KERNEL(matmul, pten::MatmulKernel, float, double, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::complex, + pten::dtype::complex) {} diff --git a/paddle/pten/kernels/cpu/reduce.h b/paddle/pten/kernels/cpu/reduce.h index bdf9e65f54188..0882e13a8522b 100644 --- a/paddle/pten/kernels/cpu/reduce.h +++ b/paddle/pten/kernels/cpu/reduce.h @@ -23,7 +23,7 @@ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/funcs/eigen/common.h" -#include "paddle/pten/kernels/funcs/transpose.h" +#include "paddle/pten/kernels/funcs/math_function.h" // See Note [ Why still include the fluid headers? ] #include "paddle/fluid/operators/eigen/eigen_function.h" namespace pten { @@ -80,7 +80,7 @@ void ReduceFunctor(const DeviceContext& context, inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims, const std::vector& reduced_dims, - std::vector* perm_axis) { + std::vector* perm_axis) { // check if it's a reduced dim std::vector src_dims_check(src_dims.size(), false); size_t src_size = src_dims.size(); @@ -115,13 +115,13 @@ void GetShuffledInput(const DeviceContext& dev_ctx, pten::DenseTensor* shuffled_input, const std::vector& dims) { DDim shuffled_dims(input.dims()); - std::vector perm_axis(input.dims().size()); + std::vector perm_axis(input.dims().size()); GetShuffledDim(input.dims(), &shuffled_dims, dims, &perm_axis); shuffled_input->ResizeAndAllocate(shuffled_dims); dev_ctx.template Alloc(shuffled_input); - pten::math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(dev_ctx, input, shuffled_input, perm_axis); } diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index e1a1788815ebf..03fe240a88b13 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -45,10 +45,10 @@ PT_REGISTER_KERNEL(empty, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(empty_like, CPU, @@ -61,10 +61,10 @@ PT_REGISTER_KERNEL(empty_like, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PT_REGISTER_KERNEL(empty, @@ -78,9 +78,9 @@ PT_REGISTER_KERNEL(empty, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::complex, + pten::dtype::complex) {} PT_REGISTER_KERNEL(empty_like, GPU, @@ -93,8 +93,8 @@ PT_REGISTER_KERNEL(empty_like, int, int64_t, bool, - paddle::platform::float16, - paddle::platform::bfloat16, - paddle::platform::complex, - paddle::platform::complex) {} + pten::dtype::float16, + pten::dtype::bfloat16, + pten::dtype::complex, + pten::dtype::complex) {} #endif diff --git a/paddle/pten/kernels/flatten_grad_kernel.cc b/paddle/pten/kernels/flatten_grad_kernel.cc index cbbf62f1993e2..7850f1170b832 100644 --- a/paddle/pten/kernels/flatten_grad_kernel.cc +++ b/paddle/pten/kernels/flatten_grad_kernel.cc @@ -49,7 +49,7 @@ PT_REGISTER_KERNEL(flatten_grad, ALL_LAYOUT, pten::FlattenGradKernel, float, - paddle::platform::float16, + pten::dtype::float16, double, uint8_t, int8_t, @@ -64,7 +64,7 @@ 
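The reduce.h hunk above now reaches TransposeNormal through pten::funcs (via math_function.h, with int axes) instead of the deleted transpose.h. The underlying trick is to permute the reduced axes to the back so the reduction runs over a contiguous trailing block. A small sketch of that permutation in the spirit of GetShuffledDim (simplified signature, illustrative names):

#include <cstdint>
#include <cstdio>
#include <vector>

// Move reduced axes to the back so a reduction can run over a contiguous
// trailing block; simplified version of the perm_axis computation.
void ShuffleForReduce(const std::vector<int64_t>& src_dims,
                      const std::vector<int>& reduced_axes,
                      std::vector<int64_t>* dst_dims,
                      std::vector<int>* perm) {
  std::vector<bool> is_reduced(src_dims.size(), false);
  for (int axis : reduced_axes) is_reduced[axis] = true;
  // Kept axes first, in their original order...
  for (size_t i = 0; i < src_dims.size(); ++i) {
    if (!is_reduced[i]) perm->push_back(static_cast<int>(i));
  }
  // ...then the reduced axes, forming the trailing block.
  for (int axis : reduced_axes) perm->push_back(axis);
  for (int p : *perm) dst_dims->push_back(src_dims[p]);
}

int main() {
  std::vector<int64_t> dst;
  std::vector<int> perm;
  ShuffleForReduce({2, 3, 4, 5}, {1, 3}, &dst, &perm);
  for (int p : perm) std::printf("%d ", p);  // 0 2 1 3
  std::printf("\n");
  for (long long d : dst) std::printf("%lld ", d);  // 2 4 3 5
  std::printf("\n");
  return 0;
}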
PT_REGISTER_KERNEL(flatten_grad, ALL_LAYOUT, pten::FlattenGradKernel, float, - paddle::platform::float16, + pten::dtype::float16, int8_t, int, int64_t) {} diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc index b0d05803ac351..0ae6cd1b9c35e 100644 --- a/paddle/pten/kernels/flatten_kernel.cc +++ b/paddle/pten/kernels/flatten_kernel.cc @@ -76,7 +76,7 @@ PT_REGISTER_KERNEL(flatten, ALL_LAYOUT, pten::FlattenKernel, float, - paddle::platform::float16, + pten::dtype::float16, double, uint8_t, int8_t, @@ -88,7 +88,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, ALL_LAYOUT, pten::FlattenWithXShape, float, - paddle::platform::float16, + pten::dtype::float16, double, uint8_t, int8_t, @@ -102,7 +102,7 @@ PT_REGISTER_KERNEL(flatten, ALL_LAYOUT, pten::FlattenKernel, float, - paddle::platform::float16, + pten::dtype::float16, int8_t, int, int64_t) {} @@ -112,7 +112,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, ALL_LAYOUT, pten::FlattenWithXShape, float, - paddle::platform::float16, + pten::dtype::float16, int8_t, int, int64_t) {} diff --git a/paddle/pten/kernels/funcs/CMakeLists.txt b/paddle/pten/kernels/funcs/CMakeLists.txt index e4dd437629a9b..844464a52dcbf 100644 --- a/paddle/pten/kernels/funcs/CMakeLists.txt +++ b/paddle/pten/kernels/funcs/CMakeLists.txt @@ -1,12 +1,5 @@ add_subdirectory(eigen) -cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context) -if(WITH_GPU) - nv_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) -elseif(WITH_ROCM) - hip_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) -endif() - function(math_library TARGET) # math_library is a function to create math library. # The interface is the same as cc_library. @@ -47,10 +40,3 @@ function(math_library TARGET) endfunction() math_library(math_function DEPS blas dense_tensor tensor) -cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) -if(WITH_GPU) - nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) -endif() -if(WITH_ROCM) - hip_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) -endif() diff --git a/paddle/pten/kernels/funcs/common_shape.h b/paddle/pten/kernels/funcs/common_shape.h index e751f85b50f24..c947771900304 100644 --- a/paddle/pten/kernels/funcs/common_shape.h +++ b/paddle/pten/kernels/funcs/common_shape.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/eigen/eigen_function.h" namespace pten { namespace funcs { diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc index dec89e79565de..facb26a552019 100644 --- a/paddle/pten/kernels/funcs/math_function.cc +++ b/paddle/pten/kernels/funcs/math_function.cc @@ -36,12 +36,12 @@ limitations under the License. 
*/ namespace pten { namespace funcs { -using float16 = paddle::platform::float16; +using float16 = pten::dtype::float16; template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -50,12 +50,12 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -63,15 +63,14 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; +template struct SetConstant>; +template struct SetConstant>; #ifdef PADDLE_WITH_XPU template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -80,17 +79,17 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; #endif #define DEFINE_CPU_TRANS(RANK) \ template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ template struct Transpose; \ @@ -107,10 +106,26 @@ template struct SetConstant; \ template struct Transpose; \ template struct Transpose, \ + pten::dtype::complex, \ RANK>; \ template struct Transpose, \ + pten::dtype::complex, \ + RANK>; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ RANK>; DEFINE_CPU_TRANS(1); @@ -120,41 +135,41 @@ DEFINE_CPU_TRANS(4); DEFINE_CPU_TRANS(5); DEFINE_CPU_TRANS(6); -template -struct TransposeNormal { - void operator()(const paddle::platform::CPUDeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = paddle::framework::stride(in.dims()); - auto out_stride = paddle::framework::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = out->data(); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; +template +void TransposeNormal::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = 
[&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; } - }; - transpose_helper(0, out->numel()); - } -}; + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); +} // define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal; \ + template struct TransposeNormal -DEFINE_CPU_TRANS_NORMAL(paddle::platform::float16); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::float16); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::bfloat16); DEFINE_CPU_TRANS_NORMAL(float); DEFINE_CPU_TRANS_NORMAL(double); DEFINE_CPU_TRANS_NORMAL(int); @@ -163,8 +178,8 @@ DEFINE_CPU_TRANS_NORMAL(bool); DEFINE_CPU_TRANS_NORMAL(int16_t); DEFINE_CPU_TRANS_NORMAL(uint8_t); DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::complex); +DEFINE_CPU_TRANS_NORMAL(pten::dtype::complex); struct TensorSetConstantCPU { TensorSetConstantCPU(paddle::framework::Tensor* tensor, float value) @@ -343,7 +358,7 @@ struct ElementwiseAddTo { }; template struct ElementwiseAddTo; + pten::dtype::float16>; } // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.cu b/paddle/pten/kernels/funcs/math_function.cu index 8ed72dbd1c127..d019a382d7717 100644 --- a/paddle/pten/kernels/funcs/math_function.cu +++ b/paddle/pten/kernels/funcs/math_function.cu @@ -27,13 +27,13 @@ limitations under the License. 
*/ namespace pten { namespace funcs { -using float16 = paddle::platform::float16; -using bfloat16 = paddle::platform::bfloat16; +using float16 = pten::dtype::float16; +using bfloat16 = pten::dtype::bfloat16; template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -42,12 +42,12 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -55,14 +55,13 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; +template struct SetConstant>; +template struct SetConstant>; template struct SetConstant; + pten::dtype::float16>; template struct SetConstant; + pten::dtype::bfloat16>; template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -71,9 +70,9 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant>; + pten::dtype::complex>; template struct SetConstant>; + pten::dtype::complex>; #define DEFINE_GPU_TRANS(RANK) \ template struct Transpose; \ @@ -97,10 +96,24 @@ template struct SetConstant; \ template struct Transpose, \ + pten::dtype::complex, \ RANK>; \ template struct Transpose, \ + pten::dtype::complex, \ + RANK>; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ RANK>; DEFINE_GPU_TRANS(1); @@ -133,60 +146,53 @@ __global__ void TransposeNormalKernel(const T* in_ptr, } } -template -struct TransposeNormal { - void operator()(const paddle::platform::CUDADeviceContext& context, - const paddle::framework::Tensor& in, - paddle::framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = paddle::framework::stride(in.dims()); - auto out_stride = paddle::framework::stride(out->dims()); - auto* in_ptr = in.data(); - auto* out_ptr = out->data(); - - // copy in_stride, out_stride, axis to gpu device - const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); - paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); - size_t size = 3 * rank * sizeof(int64_t); - auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); - auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); - REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); - REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); - for (int i = 0; i < rank; ++i) { - cpu_buf[i] = in_stride[i]; - cpu_buf[rank + i] = out_stride[i]; - cpu_buf[2 * rank + i] = axis[i]; - } - paddle::memory::Copy( - cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); - REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); - REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); - REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); +template +void TransposeNormal::operator()( + const DeviceContext& 
context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); - const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = - context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - int64_t elements = in.numel(); - int block_size = (elements >= MAX_BLOCK_DIM) - ? MAX_BLOCK_DIM - : (1 << static_cast(std::log2(elements))); - int grid_size = elements / block_size; - grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; - TransposeNormalKernel<<>>( - in_ptr, - out_ptr, - elements, - in_stride_ptr, - out_stride_ptr, - axis_ptr, - rank); + // copy in_stride, out_stride, axis to gpu device + const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; } -}; + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank); +} // define transpose normal -#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal +#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal; \ + template struct TransposeNormal DEFINE_GPU_TRANS_NORMAL(float16); DEFINE_GPU_TRANS_NORMAL(bfloat16); @@ -198,8 +204,8 @@ DEFINE_GPU_TRANS_NORMAL(bool); DEFINE_GPU_TRANS_NORMAL(int16_t); DEFINE_GPU_TRANS_NORMAL(uint8_t); DEFINE_GPU_TRANS_NORMAL(int8_t); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_GPU_TRANS_NORMAL(pten::dtype::complex); +DEFINE_GPU_TRANS_NORMAL(pten::dtype::complex); struct TensorSetConstantGPU { TensorSetConstantGPU(const paddle::platform::DeviceContext& context, @@ -374,7 +380,7 @@ struct ElementwiseAddTo { }; template struct ElementwiseAddTo; + pten::dtype::float16>; } // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/transpose.cc b/paddle/pten/kernels/funcs/transpose.cc deleted file mode 100644 index 7d4dc3c7ce8f0..0000000000000 --- a/paddle/pten/kernels/funcs/transpose.cc +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/funcs/transpose.h" -#include "paddle/pten/backends/cpu/cpu_context.h" -#include "paddle/pten/core/ddim.h" -#include "paddle/pten/core/dense_tensor.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/pten/common/bfloat16.h" -#include "paddle/pten/common/complex.h" -#include "paddle/pten/common/float16.h" - -namespace pten { -namespace math { - -template -struct TransposeNormal { - // for dims >= 7 situation - void operator()(const CPUContext& dev_ctx, - const pten::DenseTensor& in, - pten::DenseTensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = pten::framework::stride(in.dims()); - auto out_stride = pten::framework::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = dev_ctx.template Alloc(out); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; - } - }; - transpose_helper(0, out->numel()); - } -}; - -// define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_CPU_TRANS_NORMAL(bool); -DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(uint8_t); -DEFINE_CPU_TRANS_NORMAL(int16_t); -DEFINE_CPU_TRANS_NORMAL(int32_t); -DEFINE_CPU_TRANS_NORMAL(int64_t); -DEFINE_CPU_TRANS_NORMAL(float); -DEFINE_CPU_TRANS_NORMAL(double); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::float16); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::bfloat16); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); - -} // namespace math -} // namespace pten diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu deleted file mode 100644 index 09baa2c6e023a..0000000000000 --- a/paddle/pten/kernels/funcs/transpose.cu +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
diff --git a/paddle/pten/kernels/funcs/transpose.cu b/paddle/pten/kernels/funcs/transpose.cu
deleted file mode 100644
index 09baa2c6e023a..0000000000000
--- a/paddle/pten/kernels/funcs/transpose.cu
+++ /dev/null
@@ -1,124 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/ddim.h"
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/funcs/transpose.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/pten/common/bfloat16.h"
-#include "paddle/pten/common/complex.h"
-#include "paddle/pten/common/float16.h"
-
-namespace pten {
-
-namespace math {
-
-#define REINTERPRET(T, DST_PTR, SRC_PTR) \
-  T* DST_PTR = reinterpret_cast<T*>(SRC_PTR)
-
-template <typename T>
-__global__ void TransposeNormalKernel(const T* in_ptr,
-                                      T* out_ptr,
-                                      int64_t element,
-                                      const int64_t* in_stride_ptr,
-                                      const int64_t* out_stride_ptr,
-                                      const int64_t* axis_ptr,
-                                      int rank) {
-  CUDA_KERNEL_LOOP(out_idx, element) {
-    int64_t in_idx = 0;
-    int64_t tmp_idx = out_idx;
-    for (int i = 0; i < rank; ++i) {
-      const int64_t coordinate = tmp_idx / out_stride_ptr[i];
-      tmp_idx -= coordinate * out_stride_ptr[i];
-      in_idx += coordinate * in_stride_ptr[axis_ptr[i]];
-    }
-    out_ptr[out_idx] = in_ptr[in_idx];
-  }
-}
-
-template <typename T>
-struct TransposeNormal<GPUContext, T> {
-  // for dims >= 7 situation
-  void operator()(const GPUContext& dev_ctx,
-                  const pten::DenseTensor& in,
-                  pten::DenseTensor* out,
-                  const std::vector<int>& axis) {
-    const int rank = axis.size();
-    auto in_stride = pten::framework::stride(in.dims());
-    auto out_stride = pten::framework::stride(out->dims());
-    auto* in_ptr = in.data<T>();
-    T* out_ptr = dev_ctx.template Alloc<T>(out);
-
-    // copy in_stride, out_stride, axis to gpu device
-    const paddle::platform::CUDAPlace& cuda_place = dev_ctx.GetPlace();
-    paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace();
-    size_t size = 3 * rank * sizeof(int64_t);
-    auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size);
-    auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size);
-    REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
-    REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
-    for (int i = 0; i < rank; ++i) {
-      cpu_buf[i] = in_stride[i];
-      cpu_buf[rank + i] = out_stride[i];
-      cpu_buf[2 * rank + i] = axis[i];
-    }
-    paddle::memory::Copy(
-        cuda_place, cuda_buf, cpu_place, cpu_buf, size, dev_ctx.stream());
-    REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
-    REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
-    REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
-
-    const int MAX_BLOCK_DIM = dev_ctx.GetMaxThreadsPerBlock();
-    const int MAX_GRID_DIM =
-        dev_ctx.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
-    int64_t elements = in.numel();
-    int block_size = (elements >= MAX_BLOCK_DIM)
-                         ? MAX_BLOCK_DIM
-                         : (1 << static_cast<int>(std::log2(elements)));
-    int grid_size = elements / block_size;
-    grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
-    TransposeNormalKernel<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
-        in_ptr,
-        out_ptr,
-        elements,
-        in_stride_ptr,
-        out_stride_ptr,
-        axis_ptr,
-        rank);
-  }
-};
-
-// define transpose normal
-#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
-  template struct TransposeNormal<GPUContext, TYPE>
-
-DEFINE_GPU_TRANS_NORMAL(bool);
-DEFINE_GPU_TRANS_NORMAL(int8_t);
-DEFINE_GPU_TRANS_NORMAL(uint8_t);
-DEFINE_GPU_TRANS_NORMAL(int16_t);
-DEFINE_GPU_TRANS_NORMAL(int32_t);
-DEFINE_GPU_TRANS_NORMAL(int64_t);
-DEFINE_GPU_TRANS_NORMAL(float);
-DEFINE_GPU_TRANS_NORMAL(double);
-DEFINE_GPU_TRANS_NORMAL(paddle::platform::float16);
-DEFINE_GPU_TRANS_NORMAL(paddle::platform::bfloat16);
-DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex<float>);
-DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex<double>);
-
-}  // namespace math
-}  // namespace pten
diff --git a/paddle/pten/kernels/funcs/transpose.h b/paddle/pten/kernels/funcs/transpose.h
deleted file mode 100644
index 0cb2b4289fe6e..0000000000000
--- a/paddle/pten/kernels/funcs/transpose.h
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/pten/core/ddim.h"
-#include "paddle/pten/core/dense_tensor.h"
-
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/pten/kernels/funcs/eigen/common.h"
-
-namespace pten {
-
-namespace math {
-
-template <typename DeviceContext, typename T>
-struct TransposeNormal {
-  // for dims >= 7 situation
-  void operator()(const DeviceContext& dev_ctx,
-                  const pten::DenseTensor& in,
-                  pten::DenseTensor* out,
-                  const std::vector<int>& axis);
-};
-
-template <typename DeviceContext, typename T, int Rank>
-struct Transpose {
-  void operator()(const DeviceContext& dev_ctx,
-                  const DenseTensor& in,
-                  DenseTensor* out,
-                  const std::vector<int>& axis) {
-    Eigen::array<int, Rank> permute;
-    for (int i = 0; i < Rank; i++) {
-      permute[i] = axis[i];
-    }
-    auto eigen_in = pten::EigenTensor<T, Rank>::From(in);
-    auto eigen_out = pten::EigenTensor<T, Rank>::From(*out);
-    auto* dev = dev_ctx.eigen_device();
-    // use 32bit index to speed up computation
-    bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
-    bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace());
-    if (use_32bit_index && is_gpu_place) {
-      To32BitIndex(eigen_out).device(*dev) =
-          To32BitIndex(eigen_in).shuffle(permute);
-    } else {
-      eigen_out.device(*dev) = eigen_in.shuffle(permute);
-    }
-  }
-};
-
-}  // namespace math
-}  // namespace pten
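For ranks the deleted header could enumerate at compile time, the Eigen path does the work: Transpose copies axis into a fixed-size permutation and hands it to Eigen's shuffle expression, switching to 32-bit indexing on GPU when the tensor is small enough. A rough illustration of the shuffle semantics only (assumes Eigen's unsupported Tensor module is available; not code from this patch):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> in(2, 3);
  in.setValues({{0, 1, 2}, {3, 4, 5}});
  // Output dimension i takes input dimension permute[i], as in Transpose.
  Eigen::array<int, 2> permute = {1, 0};
  Eigen::Tensor<float, 2> out = in.shuffle(permute);  // shape (3, 2)
  return out(2, 1) == 5.0f ? 0 : 1;  // out(j, i) == in(i, j)
}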
diff --git a/paddle/pten/kernels/gpu/cast_kernel.cu b/paddle/pten/kernels/gpu/cast_kernel.cu
index 8fba5bc6fba65..006b4f3687c44 100644
--- a/paddle/pten/kernels/gpu/cast_kernel.cu
+++ b/paddle/pten/kernels/gpu/cast_kernel.cu
@@ -72,16 +72,16 @@ void CastKernel(const Context& dev_ctx,
                      int16_t,                          \
                      bool,                             \
                      uint8_t,                          \
-                     paddle::platform::float16,        \
-                     paddle::platform::complex<float>, \
-                     paddle::platform::complex<double>, \
+                     pten::dtype::float16,             \
+                     pten::dtype::complex<float>,      \
+                     pten::dtype::complex<double>,     \
                      ##__VA_ARGS__) {                  \
     kernel->OutputAt(0).SetDataType(                   \
         paddle::experimental::DataType::UNDEFINED);    \
   }

 #if !defined(PADDLE_WITH_HIP)
-PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
+PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, pten::dtype::bfloat16)
 #else
 PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
 #endif
diff --git a/paddle/pten/kernels/gpu/complex_kernel.cu b/paddle/pten/kernels/gpu/complex_kernel.cu
index cd9c95de2ab01..f52159b987b4f 100644
--- a/paddle/pten/kernels/gpu/complex_kernel.cu
+++ b/paddle/pten/kernels/gpu/complex_kernel.cu
@@ -25,9 +25,9 @@ PT_REGISTER_KERNEL(conj,
                    GPU,
                    ALL_LAYOUT,
                    pten::ConjKernel,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>,
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>,
                    float,
                    double,
                    int,
diff --git a/paddle/pten/kernels/gpu/concat_kernel.cu b/paddle/pten/kernels/gpu/concat_kernel.cu
index 093af0d54f6eb..784812861fd18 100644
--- a/paddle/pten/kernels/gpu/concat_kernel.cu
+++ b/paddle/pten/kernels/gpu/concat_kernel.cu
@@ -120,7 +120,7 @@ PT_REGISTER_KERNEL(concat,
                    int64_t,
                    int,
                    uint8_t,
-                   paddle::platform::float16,
-                   paddle::platform::bfloat16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::bfloat16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}
diff --git a/paddle/pten/kernels/gpu/dot_grad_kernel.cu b/paddle/pten/kernels/gpu/dot_grad_kernel.cu
index 90c37ea1b0061..a371daf79157c 100644
--- a/paddle/pten/kernels/gpu/dot_grad_kernel.cu
+++ b/paddle/pten/kernels/gpu/dot_grad_kernel.cu
@@ -28,5 +28,5 @@ PT_REGISTER_KERNEL(dot_grad,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}
diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu
index 24bd034fb15a0..7881dfa9b1bda 100644
--- a/paddle/pten/kernels/gpu/dot_kernel.cu
+++ b/paddle/pten/kernels/gpu/dot_kernel.cu
@@ -49,8 +49,8 @@ void DotKernel(const Context& dev_ctx,

 }  // namespace pten

-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
+using complex64 = ::pten::dtype::complex<float>;
+using complex128 = ::pten::dtype::complex<double>;

 PT_REGISTER_KERNEL(dot,
                    GPU,
diff --git a/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu
index f1b3f53b809df..3be02106afef3 100644
--- a/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu
+++ b/paddle/pten/kernels/gpu/elementwise_grad_kernel.cu
@@ -128,9 +128,9 @@ PT_REGISTER_KERNEL(add_grad,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}

 PT_REGISTER_KERNEL(add_double_grad,
                    GPU,
@@ -140,9 +140,9 @@ PT_REGISTER_KERNEL(add_double_grad,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}

 PT_REGISTER_KERNEL(add_triple_grad,
                    GPU,
@@ -152,9 +152,9 @@ PT_REGISTER_KERNEL(add_triple_grad,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}

 PT_REGISTER_KERNEL(subtract_grad,
                    GPU,
@@ -164,9 +164,9 @@ PT_REGISTER_KERNEL(subtract_grad,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}

 PT_REGISTER_KERNEL(subtract_double_grad,
                    GPU,
@@ -176,6 +176,6 @@ PT_REGISTER_KERNEL(subtract_double_grad,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}
diff --git a/paddle/pten/kernels/gpu/expand_grad_kernel.cu b/paddle/pten/kernels/gpu/expand_grad_kernel.cu
index 49f8718c483ce..b4e89de892449 100644
--- a/paddle/pten/kernels/gpu/expand_grad_kernel.cu
+++ b/paddle/pten/kernels/gpu/expand_grad_kernel.cu
@@ -24,6 +24,6 @@ PT_REGISTER_KERNEL(expand_grad,
                    pten::ExpandGradKernel,
                    float,
                    double,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    int,
                    int64_t) {}
diff --git a/paddle/pten/kernels/gpu/expand_kernel.cu b/paddle/pten/kernels/gpu/expand_kernel.cu
index e0d8536d6ab34..455eb6ef14cb5 100644
--- a/paddle/pten/kernels/gpu/expand_kernel.cu
+++ b/paddle/pten/kernels/gpu/expand_kernel.cu
@@ -25,7 +25,7 @@ PT_REGISTER_KERNEL(expand,
                    pten::ExpandKernel,
                    float,
                    double,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    int,
                    int64_t,
                    bool) {}
diff --git a/paddle/pten/kernels/gpu/full_kernel.cu b/paddle/pten/kernels/gpu/full_kernel.cu
index 6ea1f1282ddc1..7f600fb313472 100644
--- a/paddle/pten/kernels/gpu/full_kernel.cu
+++ b/paddle/pten/kernels/gpu/full_kernel.cu
@@ -106,9 +106,9 @@ PT_REGISTER_KERNEL(full,
                    int,
                    int64_t,
                    bool,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}

 PT_REGISTER_KERNEL(full_like,
                    GPU,
@@ -119,4 +119,4 @@ PT_REGISTER_KERNEL(full_like,
                    int,
                    int64_t,
                    bool,
-                   paddle::platform::float16) {}
+                   pten::dtype::float16) {}
diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu
index 1a549087e4221..387defc9f4186 100644
--- a/paddle/pten/kernels/gpu/math_kernel.cu
+++ b/paddle/pten/kernels/gpu/math_kernel.cu
@@ -91,9 +91,9 @@ DEFINE_CUDA_ELEMENTWISE_OP(Divide)

 }  // namespace pten

-using float16 = paddle::platform::float16;
-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
+using float16 = pten::dtype::float16;
+using complex64 = ::pten::dtype::complex<float>;
+using complex128 = ::pten::dtype::complex<double>;

 PT_REGISTER_KERNEL(add_raw,
                    GPU,
diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu
index 7df99260aa161..306fe5540e372 100644
--- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu
+++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu
@@ -25,10 +25,10 @@ PT_REGISTER_KERNEL(matmul_grad,
                    pten::MatmulGradKernel,
                    float,
                    double,
-                   paddle::platform::float16,
-                   paddle::platform::bfloat16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::bfloat16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}

 PT_REGISTER_KERNEL(matmul_double_grad,
                    GPU,
@@ -36,9 +36,9 @@ PT_REGISTER_KERNEL(matmul_double_grad,
                    pten::MatmulDoubleGradKernel,
                    float,
                    double,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}

 PT_REGISTER_KERNEL(matmul_triple_grad,
                    GPU,
@@ -46,6 +46,6 @@ PT_REGISTER_KERNEL(matmul_triple_grad,
                    pten::MatmulTripleGradKernel,
                    float,
                    double,
-                   paddle::platform::float16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}
diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu
index b365581e949c1..ebb17963ab0df 100644
--- a/paddle/pten/kernels/gpu/matmul_kernel.cu
+++ b/paddle/pten/kernels/gpu/matmul_kernel.cu
@@ -26,7 +26,7 @@ PT_REGISTER_KERNEL(matmul,
                    pten::MatmulKernel,
                    float,
                    double,
-                   paddle::platform::float16,
-                   paddle::platform::bfloat16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {}
+                   pten::dtype::float16,
+                   pten::dtype::bfloat16,
+                   pten::dtype::complex<float>,
+                   pten::dtype::complex<double>) {}
diff --git a/paddle/pten/kernels/gpu/norm_grad_kernel.cu b/paddle/pten/kernels/gpu/norm_grad_kernel.cu
index 35701d349ad3c..4c2cc5347d8e6 100644
--- a/paddle/pten/kernels/gpu/norm_grad_kernel.cu
+++ b/paddle/pten/kernels/gpu/norm_grad_kernel.cu
@@ -117,4 +117,4 @@ PT_REGISTER_KERNEL(norm_grad,
                    pten::NormGradKernel,
                    float,
                    double,
-                   paddle::platform::float16) {}
+                   pten::dtype::float16) {}
diff --git a/paddle/pten/kernels/gpu/norm_kernel.cu b/paddle/pten/kernels/gpu/norm_kernel.cu
index 6e2ee65231973..66383f53b5853 100644
--- a/paddle/pten/kernels/gpu/norm_kernel.cu
+++ b/paddle/pten/kernels/gpu/norm_kernel.cu
@@ -130,4 +130,4 @@ PT_REGISTER_KERNEL(norm,
                    pten::NormKernel,
                    float,
                    double,
-                   paddle::platform::float16) {}
+                   pten::dtype::float16) {}
diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h
index c83662c03c7b0..bca8d455623c0 100644
--- a/paddle/pten/kernels/gpu/reduce.h
+++ b/paddle/pten/kernels/gpu/reduce.h
@@ -1004,15 +1004,14 @@ template <typename Tx,
           typename Ty,
           template <typename> class ReduceOp,
           typename TransformOp>
-static
-    typename std::enable_if<!std::is_same<Tx, paddle::platform::float16>::value,
-                            void>::type
-    CubTensorReduceImpl(const Tx* x_data,
-                        Ty* y_data,
-                        const TransformOp& transform,
-                        int reduce_num,
-                        const paddle::platform::Place& place,
-                        gpuStream_t stream) {
+static typename std::enable_if<!std::is_same<Tx, pten::dtype::float16>::value,
+                               void>::type
+CubTensorReduceImpl(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const paddle::platform::Place& place,
+                    gpuStream_t stream) {
   auto reducer = ReduceOp<Ty>();
   cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
                                                                   transform);
@@ -1048,15 +1047,14 @@ template <typename Tx,
           typename Ty,
           template <typename> class ReduceOp,
           typename TransformOp>
-static
-    typename std::enable_if<std::is_same<Tx, paddle::platform::float16>::value,
-                            void>::type
-    CubTensorReduceImpl(const Tx* x_data,
-                        Ty* y_data,
-                        const TransformOp& transform,
-                        int reduce_num,
-                        const paddle::platform::Place& place,
-                        gpuStream_t stream) {
+static typename std::enable_if<std::is_same<Tx, pten::dtype::float16>::value,
+                               void>::type
+CubTensorReduceImpl(const Tx* x_data,
+                    Ty* y_data,
+                    const TransformOp& transform,
+                    int reduce_num,
+                    const paddle::platform::Place& place,
+                    gpuStream_t stream) {
   PADDLE_THROW(pten::errors::InvalidArgument(
       "Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
 }
@@ -1099,7 +1097,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx,
   }
   config.SetOutputData(y_data, x.place(), &tmp);

-  constexpr bool kIsTxFP16 = std::is_same<Tx, paddle::platform::float16>::value;
+  constexpr bool kIsTxFP16 = std::is_same<Tx, pten::dtype::float16>::value;
   bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
   if (use_cub_reduce) {
     CubTensorReduceImpl<Tx, Ty, ReduceOp, TransformOp>(
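The two CubTensorReduceImpl overloads reformatted above use enable_if to pick an implementation at compile time: the cub-based body is only instantiated when Tx is not float16, and the float16 instantiation collapses to a runtime error. A minimal sketch of that SFINAE split, with float16 stubbed by a local tag type rather than the Paddle definition:

#include <cstdio>
#include <type_traits>

struct float16 {};  // stand-in tag for pten::dtype::float16

// Selected for every Tx except float16.
template <typename Tx>
typename std::enable_if<!std::is_same<Tx, float16>::value, void>::type
TensorReduce() {
  std::puts("fast path: cub-style device reduce");
}

// Selected only for float16; mirrors the PADDLE_THROW branch above.
template <typename Tx>
typename std::enable_if<std::is_same<Tx, float16>::value, void>::type
TensorReduce() {
  std::puts("float16 is rejected on the cub path");
}

int main() {
  TensorReduce<float>();    // fast path
  TensorReduce<float16>();  // fallback
}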
diff --git a/paddle/pten/kernels/gpu/sign_kernel.cu b/paddle/pten/kernels/gpu/sign_kernel.cu
index 2a96ff653035a..d479d6a2b2d51 100644
--- a/paddle/pten/kernels/gpu/sign_kernel.cu
+++ b/paddle/pten/kernels/gpu/sign_kernel.cu
@@ -21,7 +21,7 @@ limitations under the License. */
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/pten/common/float16.h"

-using float16 = paddle::platform::float16;
+using float16 = pten::dtype::float16;

 PT_REGISTER_KERNEL(
     sign, GPU, ALL_LAYOUT, pten::SignKernel, float, double, float16) {}
diff --git a/paddle/pten/kernels/impl/full_kernel_impl.h b/paddle/pten/kernels/impl/full_kernel_impl.h
index 4fbe9f34e5b4d..4c018e34e2ebc 100644
--- a/paddle/pten/kernels/impl/full_kernel_impl.h
+++ b/paddle/pten/kernels/impl/full_kernel_impl.h
@@ -47,10 +47,9 @@ void FullLikeKernel(const Context& dev_ctx,
   auto value = val.to<float>();
   using CommonType = typename std::common_type<
       float,
-      typename std::conditional<
-          std::is_same<T, paddle::platform::float16>::value,
-          float,
-          T>::type>::type;
+      typename std::conditional<std::is_same<T, pten::dtype::float16>::value,
+                                float,
+                                T>::type>::type;

   auto common_type_value = static_cast<CommonType>(value);
diff --git a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
index b346acb6e25c6..f84187484b194 100644
--- a/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
+++ b/paddle/pten/kernels/impl/matmul_grad_kernel_impl.h
@@ -90,7 +90,7 @@ static DenseTensor FoldHeadAndLastDims(const Context& dev_ctx,
   DenseTensor output = EmptyLike<T, Context>(dev_ctx, input);
   output.Resize({in_dims[1], in_dims[0], in_dims[2]});
   std::vector<int> axis = {1, 0, 2};
-  math::Transpose<Context, T, 3> trans;
+  funcs::Transpose<Context, T, 3> trans;
   trans(dev_ctx, input, &output, axis);
   output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
   return output;
diff --git a/paddle/pten/kernels/math_kernel.cc b/paddle/pten/kernels/math_kernel.cc
index 29a2b48fa7c4f..2356fb34bf1b7 100644
--- a/paddle/pten/kernels/math_kernel.cc
+++ b/paddle/pten/kernels/math_kernel.cc
@@ -78,8 +78,8 @@ void MultiplyKernel(const Context& dev_ctx,

 }  // namespace pten

-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
+using complex64 = ::pten::dtype::complex<float>;
+using complex128 = ::pten::dtype::complex<double>;

 PT_REGISTER_KERNEL(
     mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {}
@@ -91,7 +91,7 @@ PT_REGISTER_KERNEL(sum,
                    bool,
                    float,
                    double,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    int,
                    int64_t,
                    complex64,
@@ -149,7 +149,7 @@ PT_REGISTER_KERNEL(mean,
                    float,
                    double,
                    bool,
-                   paddle::platform::float16) {}
+                   pten::dtype::float16) {}

 PT_REGISTER_KERNEL(sum,
                    GPU,
                    ALL_LAYOUT,
@@ -157,7 +157,7 @@ PT_REGISTER_KERNEL(sum,
                    bool,
                    float,
                    double,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    int,
                    int64_t,
                    complex64,
@@ -172,7 +172,7 @@ PT_REGISTER_KERNEL(add,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    complex64,
                    complex128) {}
 PT_REGISTER_KERNEL(subtract,
@@ -183,7 +183,7 @@ PT_REGISTER_KERNEL(subtract,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    complex64,
                    complex128) {}
 PT_REGISTER_KERNEL(divide,
@@ -194,7 +194,7 @@ PT_REGISTER_KERNEL(divide,
                    double,
                    int,
                    int64_t,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    complex64,
                    complex128) {}
 PT_REGISTER_KERNEL(multiply,
@@ -206,7 +206,7 @@ PT_REGISTER_KERNEL(multiply,
                    int,
                    int64_t,
                    bool,
-                   paddle::platform::float16,
+                   pten::dtype::float16,
                    complex64,
                    complex128) {}
 #endif
diff --git a/paddle/pten/kernels/transfer_layout_kernel.cc b/paddle/pten/kernels/transfer_layout_kernel.cc
index c21ab7c304d9e..281d40221dc95 100644
--- a/paddle/pten/kernels/transfer_layout_kernel.cc
+++ b/paddle/pten/kernels/transfer_layout_kernel.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/backends/all_context.h"
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/funcs/transpose.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 namespace pten {

@@ -42,7 +42,7 @@ void CastDataLayout(const Context& dev_ctx,
                     const DenseTensor& x,
                     const std::vector<int>& axis,
                     DenseTensor* out) {
-  math::Transpose<Context, T, 4> trans4;
+  funcs::Transpose<Context, T, 4> trans4;
   trans4(dev_ctx, x, out, axis);
 }
diff --git a/paddle/pten/tests/api/scale_api.h b/paddle/pten/tests/api/scale_api.h
index f4f9ec3c84758..5403f5932ea18 100644
--- a/paddle/pten/tests/api/scale_api.h
+++ b/paddle/pten/tests/api/scale_api.h
@@ -162,7 +162,7 @@ static void ScaleGPU(DataType kernel_dtype,
       break;
     }
     case pten::DataType::FLOAT16: {
-      pten::ScaleKernel<paddle::platform::float16>(
+      pten::ScaleKernel<pten::dtype::float16>(
           dev_ctx, x, pten::Scalar(scale), bias, bias_after_scale, dense_out);
       break;
     }
diff --git a/paddle/pten/tests/kernels/CMakeLists.txt b/paddle/pten/tests/kernels/CMakeLists.txt
index 15a1cab5f0dd4..9682e063471df 100644
--- a/paddle/pten/tests/kernels/CMakeLists.txt
+++ b/paddle/pten/tests/kernels/CMakeLists.txt
@@ -13,3 +13,11 @@ cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS pten pten_api_utils)
 cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS pten pten_api_utils)
 cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS pten pten_api_utils)
 cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS pten pten_api_utils)
+
+cc_test(test_math_function SRCS test_math_function.cc DEPS math_function)
+if(WITH_GPU)
+  nv_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function)
+endif()
+if(WITH_ROCM)
+  hip_test(test_math_function_gpu SRCS test_math_function.cu DEPS math_function)
+endif()
diff --git a/paddle/pten/kernels/funcs/math_function_test.cc b/paddle/pten/tests/kernels/test_math_function.cc
similarity index 99%
rename from paddle/pten/kernels/funcs/math_function_test.cc
rename to paddle/pten/tests/kernels/test_math_function.cc
index 6ef8c6b689d2c..0d53ff6c637ba 100644
--- a/paddle/pten/kernels/funcs/math_function_test.cc
+++ b/paddle/pten/tests/kernels/test_math_function.cc
@@ -11,9 +11,13 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/pten/kernels/funcs/math_function.h"
+
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
+
+namespace pten {
+namespace tests {

 template <typename T>
 inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
@@ -348,3 +352,6 @@ TEST(math_function, gemm_warp) {
   GemmWarpTest<double>(8, 5, 6, 1.0, 0.0);
   GemmWarpTest<double>(8, 5, 6, 2.0, 1.0);
 }
+
+}  // namespace tests
+}  // namespace pten
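The renamed GPU test file below checks fp16 BLAS results against hand-computed values; for example, notrans_mul_trans multiplies A = [[0, 1, 2], [3, 4, 5]] by its own transpose, which is why the test expects 5 and 14. A plain-C++ check of that arithmetic (illustration only; no GPU, BLAS, or float16 involved):

#include <cstdio>

int main() {
  const float a[2][3] = {{0, 1, 2}, {3, 4, 5}};
  float out[2][2] = {};  // out = a * a^T
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 3; ++k) out[i][j] += a[i][k] * a[j][k];
  // 0*0+1*1+2*2 = 5, 0*3+1*4+2*5 = 14, 3*3+4*4+5*5 = 50
  std::printf("%g %g %g\n", out[0][0], out[0][1], out[1][1]);
}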
+ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/kernels/funcs/math_function.h" -void fill_fp16_data(paddle::platform::float16* in_ptr, +namespace pten { +namespace tests { + +void fill_fp16_data(pten::dtype::float16* in_ptr, size_t size, const std::vector& data) { PADDLE_ENFORCE_EQ( @@ -28,7 +32,7 @@ void fill_fp16_data(paddle::platform::float16* in_ptr, size, data.size())); for (size_t i = 0; i < data.size(); ++i) { - in_ptr[i] = paddle::platform::float16(data[i]); + in_ptr[i] = pten::dtype::float16(data[i]); } } @@ -95,27 +99,26 @@ TEST(math_function, notrans_mul_trans_fp16) { return; } - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); - out_gpu.mutable_data({2, 2}, gpu_place); + out_gpu.mutable_data({2, 2}, gpu_place); - GetBlas(context).MatMul( - input1_gpu, - false, - input2_gpu, - true, - paddle::platform::float16(1), - &out_gpu, - paddle::platform::float16(0)); + GetBlas(context).MatMul(input1_gpu, + false, + input2_gpu, + true, + pten::dtype::float16(1), + &out_gpu, + pten::dtype::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); - paddle::platform::float16* out_ptr = out.data(); + pten::dtype::float16* out_ptr = out.data(); context.Wait(); EXPECT_EQ(static_cast(out_ptr[0]), 5); EXPECT_EQ(static_cast(out_ptr[1]), 14); @@ -185,27 +188,26 @@ TEST(math_function, trans_mul_notrans_fp16) { return; } - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); - out_gpu.mutable_data({3, 3}, gpu_place); + out_gpu.mutable_data({3, 3}, gpu_place); - GetBlas(context).MatMul( - input1_gpu, - true, - input2_gpu, - false, - paddle::platform::float16(1), - &out_gpu, - paddle::platform::float16(0)); + GetBlas(context).MatMul(input1_gpu, + true, + input2_gpu, + false, + pten::dtype::float16(1), + &out_gpu, + pten::dtype::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); - paddle::platform::float16* out_ptr = out.data(); + pten::dtype::float16* out_ptr = out.data(); context.Wait(); EXPECT_EQ(static_cast(out_ptr[0]), 9); EXPECT_EQ(static_cast(out_ptr[1]), 12); @@ -300,37 +302,37 @@ TEST(math_function, gemm_notrans_cublas_fp16) { int m = 2; int n = 3; int k = 3; - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - paddle::platform::float16* input2_ptr = - input2.mutable_data({3, 4}, cpu_place); + pten::dtype::float16* input2_ptr = + input2.mutable_data({3, 4}, cpu_place); fill_fp16_data( input2_ptr, input2.numel(), {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); - paddle::platform::float16* input3_ptr = - input3.mutable_data({2, 4}, cpu_place); + pten::dtype::float16* input3_ptr = + input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); 
paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); - paddle::platform::float16* a = input1_gpu.data(); - paddle::platform::float16* b = input2_gpu.data(); - paddle::platform::float16* c = - input3_gpu.mutable_data(gpu_place); + pten::dtype::float16* a = input1_gpu.data(); + pten::dtype::float16* b = input2_gpu.data(); + pten::dtype::float16* c = + input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM( + GetBlas(context).GEMM( false, false, m, n, k, - static_cast(1), + static_cast(1), a, 3, b + 1, 4, - static_cast(1), + static_cast(1), c + 1, 4); @@ -429,37 +431,37 @@ TEST(math_function, gemm_trans_cublas_fp16) { int m = 2; int n = 3; int k = 3; - paddle::platform::float16* input1_ptr = - input1.mutable_data({2, 3}, cpu_place); + pten::dtype::float16* input1_ptr = + input1.mutable_data({2, 3}, cpu_place); fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); - paddle::platform::float16* input2_ptr = - input2.mutable_data({4, 3}, cpu_place); + pten::dtype::float16* input2_ptr = + input2.mutable_data({4, 3}, cpu_place); fill_fp16_data( input2_ptr, input2.numel(), {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); - paddle::platform::float16* input3_ptr = - input3.mutable_data({2, 4}, cpu_place); + pten::dtype::float16* input3_ptr = + input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); paddle::framework::TensorCopySync(input1, gpu_place, &input1_gpu); paddle::framework::TensorCopySync(input2, gpu_place, &input2_gpu); paddle::framework::TensorCopySync(input3, gpu_place, &input3_gpu); - paddle::platform::float16* a = input1_gpu.data(); - paddle::platform::float16* b = input2_gpu.data(); - paddle::platform::float16* c = - input3_gpu.mutable_data(gpu_place); + pten::dtype::float16* a = input1_gpu.data(); + pten::dtype::float16* b = input2_gpu.data(); + pten::dtype::float16* c = + input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM( + GetBlas(context).GEMM( false, true, m, n, k, - static_cast(1), + static_cast(1), a, 3, b + 3, 3, - static_cast(1), + static_cast(1), c + 1, 4); @@ -547,3 +549,6 @@ TEST(math_function, gemv) { GemvTest(3, 13, true); GemvTest(3, 13, true); } + +} // namespace tests +} // namespace pten From a3247ab5e6e98a8729ac4a992c8752df527e0ed8 Mon Sep 17 00:00:00 2001 From: TTerror Date: Thu, 17 Feb 2022 17:34:04 +0800 Subject: [PATCH 19/19] refactoring where/where_index/scatter unittests for kunlun, *test=kunlun (#39619) --- .../unittests/xpu/test_refactor_op_xpu.py | 24 +- .../unittests/xpu/test_scatter_op_xpu.py | 233 ++++++++---------- .../unittests/xpu/test_where_index_xpu.py | 120 ++++----- .../tests/unittests/xpu/test_where_op_xpu.py | 85 ++++--- 4 files changed, 218 insertions(+), 244 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py index e7ee89c567f42..9d1a5ca1fbdd6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_refactor_op_xpu.py @@ -60,10 +60,11 @@ def dynamic_create_class(self): class TestArgsortOp(XPUOpTest): def setUp(self): - self.set_xpu() self.op_type = "argsort" self.place = paddle.XPUPlace(0) + self.__class__.no_need_check_grad = True self.dtype = self.in_type + self.input_shape = (2, 2, 2, 3, 3) self.axis = -1 if not hasattr(self, 'init_axis') else 
self.init_axis self.descending = False if not hasattr( @@ -94,10 +95,6 @@ def get_output(self): self.x, kind='heapsort', axis=self.axis) self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) - def set_xpu(self): - self.__class__.use_xpu = True - self.__class__.no_need_check_grad = True - def test_check_output(self): self.check_output_with_place(self.place) @@ -110,9 +107,10 @@ def __init__(self): class TestArgsortOp(XPUOpTest): def setUp(self): - self.set_xpu() self.op_type = "argsort" self.place = paddle.XPUPlace(0) + self.__class__.no_need_check_grad = True + self.init_dtype() self.init_inputshape() self.init_axis() @@ -143,10 +141,6 @@ def get_output(self): self.x, kind='heapsort', axis=self.axis) self.sorted_x = np.sort(self.x, kind='heapsort', axis=self.axis) - def set_xpu(self): - self.__class__.use_xpu = True - self.__class__.no_need_check_grad = True - def init_inputshape(self): self.input_shape = (2, 2, 2, 3, 3) @@ -220,11 +214,9 @@ def __init__(self): class TestHuberLossOp(XPUOpTest): def setUp(self): - self.set_xpu() self.op_type = 'huber_loss' self.place = paddle.XPUPlace(0) - - self.init_dtype() + self.dtype = self.in_type self.set_inputs() self.set_attrs() @@ -253,12 +245,6 @@ def set_outputs(self): def set_shape(self): return (100, 1) - def set_xpu(self): - self.__class__.use_xpu = True - - def init_dtype(self): - self.dtype = self.in_type - def test_check_output(self): self.check_output_with_place(self.place) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py index 16b75cd3f0145..68a39f3c00100 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_scatter_op_xpu.py @@ -18,152 +18,125 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest -from op_test_xpu import XPUOpTest + import paddle -import paddle.fluid as fluid -import paddle.fluid.core as core +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper, type_dict_str_to_numpy paddle.enable_static() -class TestScatterOp(XPUOpTest): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) - - ref_np = np.ones((3, 50)).astype("float32") - index_np = np.array([1, 2]).astype("int32") - updates_np = np.random.random((2, 50)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - def set_xpu(self): - self.__class__.use_xpu = True - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - +class XPUTestScatterOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'scatter' + self.use_dynamic_create_class = True -class TestScatterOp0(TestScatterOp): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) + def dynamic_create_class(self): + base_class = self.TestScatterOp + classes = [] + test_data_case = [] - ref_np = np.ones((3, 3)).astype("float32") - index_np = np.array([1, 2]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") + # case1 + ref_np = np.ones((3, 50)) + index_np = np.array([1, 2]) + updates_np = np.random.random((2, 50)) output_np = np.copy(ref_np) output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': 
updates_np} - self.attrs = {'overwrite': True} - self.outputs = {'Out': output_np} - - -class TestScatterOp1(TestScatterOp): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) - - ref_np = np.ones((3, 3)).astype("float32") - zeros_np = np.zeros([2, 3]).astype('float32') - index_np = np.array([1, 1]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = zeros_np - for i in range(0, len(index_np)): - output_np[index_np[i]] += updates_np[i] - self.attrs = {'overwrite': False} - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - -class TestScatterOp2(TestScatterOp): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) - - ref_np = np.ones((3, 3)).astype("float32") - index_np = np.array([1, 2]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") + data_dict = { + 'init_ref_np': ref_np, + 'init_index_np': index_np, + 'init_updates_np': updates_np, + 'init_output_np': output_np, + 'test_name': 'case1' + } + test_data_case.append(data_dict) + + # case2 + ref_np = np.ones((3, 3)) + index_np = np.array([1, 2]) + updates_np = np.random.random((2, 3)) output_np = np.copy(ref_np) output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - -class TestScatterOp3(TestScatterOp): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) - - ref_np = np.ones((3, 3)).astype("float32") - zeros_np = np.zeros([2, 3]).astype('float32') + data_dict = { + 'init_ref_np': ref_np, + 'init_index_np': index_np, + 'init_updates_np': updates_np, + 'init_output_np': output_np, + 'test_name': 'case2' + } + test_data_case.append(data_dict) + + # case3 + ref_np = np.ones((3, 3)) + zeros_np = np.zeros([2, 3]) index_np = np.array([1, 1]).astype("int32") - updates_np = np.random.random((2, 3)).astype("float32") + updates_np = np.random.randint(low=-1000, high=1000, size=(2, 3)) output_np = np.copy(ref_np) output_np[index_np] = zeros_np for i in range(0, len(index_np)): output_np[index_np[i]] += updates_np[i] - self.attrs = {'overwrite': False} - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - -class TestScatterOp4(TestScatterOp): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) - - ref_np = np.ones((3, 3)).astype("float32") - index_np = np.array([1, 2]).astype("int64") - updates_np = np.random.random((2, 3)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - -class TestScatterOp5(TestScatterOp): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) - - ref_np = np.ones((3, 3)).astype("float32") - index_np = np.array([1, 2]).astype("int64") - updates_np = np.random.random((2, 3)).astype("float32") - output_np = np.copy(ref_np) - output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - - -class TestScatterOp6(TestScatterOp): - def setUp(self): - self.set_xpu() - self.op_type = "scatter" - self.place = paddle.XPUPlace(0) - - ref_np = np.ones((3, 3)).astype("int64") - index_np = np.array([1, 
2]).astype("int64") - updates_np = np.random.random((2, 3)).astype("int64") - output_np = np.copy(ref_np) - output_np[index_np] = updates_np - self.inputs = {'X': ref_np, 'Ids': index_np, 'Updates': updates_np} - self.outputs = {'Out': output_np} - + data_dict = { + 'init_ref_np': ref_np, + 'init_index_np': index_np, + 'init_updates_np': updates_np, + 'init_output_np': output_np, + 'test_name': 'case3' + } + test_data_case.append(data_dict) + + for data_dict in test_data_case: + for index_type in ['int32', 'int64']: + for overwrite in [True, False]: + class_name = 'XPUTestScatterOp_index_type_' + data_dict[ + 'test_name'] + '_' + str(index_type) + '_' + str( + overwrite) + attr_dict = data_dict + attr_dict['index_type'] = type_dict_str_to_numpy[index_type] + attr_dict['init_overwrite'] = overwrite + classes.append([class_name, attr_dict]) + return base_class, classes + + class TestScatterOp(XPUOpTest): + def setUp(self): + self.init_config() + self.index_type = np.int32 if not hasattr( + self, 'index_type') else self.index_type + self.overwrite = True if not hasattr( + self, 'init_overwrite') else self.init_overwrite + + if not hasattr(self, 'init_ref_np'): + self.ref_np = np.ones((3, 50)).astype(self.dtype) + self.index_np = np.array([1, 2]).astype(self.index_type) + self.updates_np = np.random.random((2, 50)).astype(self.dtype) + self.output_np = np.copy(self.ref_np) + self.output_np[self.index_np] = self.updates_np + else: + self.ref_np = self.init_ref_np.astype(self.dtype) + self.index_np = self.init_index_np.astype(self.index_type) + self.updates_np = self.init_updates_np.astype(self.dtype) + self.output_np = self.init_output_np.astype(self.dtype) + + self.inputs = { + 'X': self.ref_np, + 'Ids': self.index_np, + 'Updates': self.updates_np + } + self.attrs = {'overwrite': self.overwrite} + self.outputs = {'Out': self.output_np} + + def init_config(self): + self.op_type = "scatter" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +support_types = get_xpu_op_support_types('scatter') +for stype in support_types: + create_test_class(globals(), XPUTestScatterOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py index 69b4f5a03ed18..9c86286d3d8c2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_where_index_xpu.py @@ -16,71 +16,77 @@ import numpy as np import unittest -import paddle import sys sys.path.append("..") -from op_test import OpTest -from op_test_xpu import XPUOpTest -from paddle.fluid.op import Operator + +import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard -paddle.enable_static() - - -class TestWhereIndexOp(XPUOpTest): - def setUp(self): - self.set_xpu() - self.op_type = "where_index" - self.place = paddle.XPUPlace(0) - self.init_config() - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad(self): - pass - - def init_config(self): - self.inputs = {'Condition': np.array([True, False, True]), } - self.outputs = {'Out': np.array([[0], [2]], dtype='int64')} - - def set_xpu(self): - self.__class__.use_xpu = True - - -class TestNotBool(TestWhereIndexOp): - def init_config(self): - self.inputs = {'Condition': np.array([1, 0, 8]), } - - self.outputs = 
{'Out': np.array([[0], [2]], dtype='int64')} - - -class TestAllFalse(TestWhereIndexOp): - def init_config(self): - self.inputs = {'Condition': np.array([False, False, False]), } - self.outputs = {'Out': np.array([], dtype='int64')} - - -class TestRank2(TestWhereIndexOp): - def init_config(self): - self.inputs = {'Condition': np.array([[True, False], [False, True]]), } - self.outputs = {'Out': np.array([[0, 0], [1, 1]], dtype='int64')} +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper +paddle.enable_static() -class TestRank3(TestWhereIndexOp): - def init_config(self): - self.inputs = { - 'Condition': np.array([[[True, False], [False, True]], - [[False, True], [True, False]], - [[False, False], [False, True]]]), - } - self.outputs = { - 'Out': np.array( - [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]], - dtype='int64') - } +class XPUTestWhereIndexOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'where_index' + + class TestWhereIndexOp(XPUOpTest): + def setUp(self): + self.init_config() + self.init_data() + + def test_check_output(self): + self.check_output_with_place(self.place) + + def init_data(self): + self.inputs = { + 'Condition': np.array([True, False, True]).astype(self.dtype), + } + self.outputs = {'Out': np.array([[0], [2]], dtype='int64')} + + def init_config(self): + self.op_type = "where_index" + self.place = paddle.XPUPlace(0) + self.dtype = self.in_type + self.__class__.no_need_check_grad = True + + class TestAllFalse(TestWhereIndexOp): + def init_data(self): + self.inputs = { + 'Condition': np.array([False, False, False]).astype(self.dtype), + } + self.outputs = {'Out': np.array([], dtype='int64')} + + class TestRank2(TestWhereIndexOp): + def init_data(self): + self.inputs = { + 'Condition': + np.array([[True, False], [False, True]]).astype(self.dtype), + } + self.outputs = {'Out': np.array([[0, 0], [1, 1]], dtype='int64')} + + class TestRank3(TestWhereIndexOp): + def init_data(self): + self.inputs = { + 'Condition': + np.array([[[True, False], [False, True]], + [[False, True], [True, False]], + [[False, False], [False, True]]]).astype(self.dtype), + } + + self.outputs = { + 'Out': np.array( + [[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 0], [2, 1, 1]], + dtype='int64') + } + + +support_types = get_xpu_op_support_types('where_index') +for stype in support_types: + create_test_class(globals(), XPUTestWhereIndexOp, stype) class TestWhereOpError(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py index 2161ec24dbf87..461b56ff0d8a8 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_where_op_xpu.py @@ -18,52 +18,61 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest -from op_test_xpu import XPUOpTest + import paddle import paddle.fluid as fluid from paddle.fluid import Program from paddle.fluid.backward import append_backward -paddle.enable_static() - - -class TestXPUWhereOp(XPUOpTest): - def setUp(self): - self.op_type = "where" - self.set_xpu() - self.init_config() - self.inputs = {'Condition': self.cond, 'X': self.x, 'Y': self.y} - self.outputs = {'Out': np.where(self.cond, self.x, self.y)} - - def init_config(self): - self.x = np.random.uniform(-3, 5, (100)).astype("float32") - self.y = np.random.uniform(-3, 5, (100)).astype("float32") - self.cond = 
np.zeros((100)).astype("bool") - - def set_xpu(self): - self.__class__.use_xpu = True - self.place = paddle.XPUPlace(0) - - def test_check_output(self): - self.check_output_with_place(self.place) - - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') - +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper -class TestXPUWhereOp2(TestXPUWhereOp): - def init_config(self): - self.x = np.random.uniform(-5, 5, (60, 2)).astype("float32") - self.y = np.random.uniform(-5, 5, (60, 2)).astype("float32") - self.cond = np.ones((60, 2)).astype("bool") +paddle.enable_static() -class TestXPUWhereOp3(TestXPUWhereOp): - def init_config(self): - self.x = np.random.uniform(-3, 5, (20, 2, 4)).astype("float32") - self.y = np.random.uniform(-3, 5, (20, 2, 4)).astype("float32") - self.cond = np.array(np.random.randint(2, size=(20, 2, 4)), dtype=bool) +class XPUTestWhereOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'where' + + class TestXPUWhereOp(XPUOpTest): + def setUp(self): + self.init_config() + self.init_data() + self.inputs = {'Condition': self.cond, 'X': self.x, 'Y': self.y} + self.outputs = {'Out': np.where(self.cond, self.x, self.y)} + + def init_data(self): + self.x = np.random.uniform(-3, 5, (100)).astype(self.dtype) + self.y = np.random.uniform(-3, 5, (100)).astype(self.dtype) + self.cond = np.zeros((100)).astype("bool") + + def init_config(self): + self.op_type = "where" + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestXPUWhereOp2(TestXPUWhereOp): + def init_data(self): + self.x = np.random.uniform(-5, 5, (60, 2)).astype(self.dtype) + self.y = np.random.uniform(-5, 5, (60, 2)).astype(self.dtype) + self.cond = np.ones((60, 2)).astype("bool") + + class TestXPUWhereOp3(TestXPUWhereOp): + def init_data(self): + self.x = np.random.uniform(-3, 5, (20, 2, 4)).astype(self.dtype) + self.y = np.random.uniform(-3, 5, (20, 2, 4)).astype(self.dtype) + self.cond = np.array( + np.random.randint( + 2, size=(20, 2, 4)), dtype=bool) + + +support_types = get_xpu_op_support_types('where') +for stype in support_types: + create_test_class(globals(), XPUTestWhereOp, stype) class TestXPUWhereAPI(unittest.TestCase):