From 36660d4c356d4c6b71eb8df51e094ea36bfa2c06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Wed, 2 Mar 2022 14:02:42 +0800 Subject: [PATCH 01/41] [infrt] speed up the infrt ci. test=devvelop (#40032) --- paddle/scripts/infrt_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index 8d858647ea63d..a0132501387e0 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -102,9 +102,11 @@ function infrt_gen_and_build() { function create_fake_models() { cd ${PADDLE_ROOT}/build + cd python/dist/ # create multi_fc model, this will generate "multi_fc_model" python3 -m pip uninstall -y paddlepaddle - python3 -m pip install paddlepaddle + python3 -m pip install *whl + cd ${PADDLE_ROOT}/build python3 ${PADDLE_ROOT}/tools/infrt/fake_models/multi_fc.py } From 9070d5c5d85e15a04324b6a5f2f1e2c9a7ecc1b6 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Wed, 2 Mar 2022 14:08:19 +0800 Subject: [PATCH 02/41] test=document_fix;record py3 case time (#40018) --- paddle/scripts/paddle_build.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9bef7e1285128..ed70a8638bf73 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -776,7 +776,9 @@ set +x tmpfile=$tmp_dir/$tmpfile_rand ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile fi - + ut_total_endTime_s=`date +%s` + echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s" + collect_failed_tests rm -f $tmp_dir/* exec_times=0 From b4d931e8bce97a12e9ac7a12ff6c0a11499002c7 Mon Sep 17 00:00:00 2001 From: qipengh Date: Wed, 2 Mar 2022 14:23:35 +0800 Subject: [PATCH 03/41] [MLU] adapt matmul op (#39727) * [MLU] adapt matmul op * [MLU] fix phi namespace --- paddle/fluid/imperative/CMakeLists.txt | 6 +- paddle/fluid/operators/matmul_op_mlu.cc | 337 ++++++++++++++++++ .../tests/unittests/mlu/test_matmul_op_mlu.py | 329 +++++++++++++++++ 3 files changed, 671 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/matmul_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index f198919b0c87b..e1ce705533ab4 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -46,8 +46,12 @@ if(WITH_GLOO) endif() endif() +if(WITH_MLU) + SET(MLU_DEPS mlu_baseop) +endif() + if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor ${MLU_DEPS}) else() cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() diff --git a/paddle/fluid/operators/matmul_op_mlu.cc b/paddle/fluid/operators/matmul_op_mlu.cc new file mode 100644 index 0000000000000..d0c84c4751e78 --- /dev/null +++ b/paddle/fluid/operators/matmul_op_mlu.cc @@ -0,0 +1,337 @@ +/* Copyright 
(c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +static void Mul(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out), ToCnnlDataType(), alpha); +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." + "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::Matmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + if (!Out->initialized()) { + Out->mutable_data(ctx.GetPlace()); + } + + PADDLE_ENFORCE_LT(fabs(alpha - 1.0), std::numeric_limits::epsilon(), + platform::errors::InvalidArgument( + "MLU(matmul): alpha should be equal to 1.0! " + "Other values are not supported yet." 
+ "But received alpha is %d.", + alpha)); + + MLUCnnlTensorDesc x_desc(X, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(Y, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*Out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + MLUCnnl::BatchMatmul(ctx, trans_x, trans_y, x_desc.get(), GetBasePtr(&X), + y_desc.get(), GetBasePtr(&Y), out_desc.get(), + GetBasePtr(Out)); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const std::vector& dims, + const std::vector& bcast_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = bcast_dims.size(); + int64_t diff = bcast_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (bcast_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc in_desc(in, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out, CNNL_LAYOUT_ARRAY, ToCnnlDataType()); + + std::vector reduce_dims(axes.begin(), axes.end()); + MLUCnnlReduceDesc reduce_desc(reduce_dims, CNNL_REDUCE_ADD, + ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN, + CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduce_desc.get(), nullptr, + in_desc.get(), GetBasePtr(&in), 0 /*indices_size*/, nullptr, + nullptr, out_desc.get(), GetBasePtr(out)); +} + +template +class MatMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + + // Case 1: [K] x [K] = [1] + // Equal: [1, K] x [K, 1] = [1, 1] => [1] + const bool all_one_dim = (x_ndim == 1 && y_ndim == 1); + if (all_one_dim) { + Out->Resize({1, 1}); + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + x_temp.Resize(phi::make_ddim(x_dims)); + x_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < y_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.insert(temp_out_dims.end() - 1, 1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + if (y_ndim == 1) { + y_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + y_ndim = 2; + // matmul op of mlu needs `std::max(x->dim, y->dim) == out->dim` + if (out_dims.size() < x_dims.size()) { + std::vector temp_out_dims(out_dims.begin(), out_dims.end()); + temp_out_dims.push_back(1); + Out->Resize(phi::make_ddim(temp_out_dims)); + } + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." 
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + if (x_ndim == 2 && y_ndim == 2) { + // Case 2: [M, K] x [K, N] = [M, N] + MatMul2D(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } else { + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + MatMulND(ctx, x_temp, y_temp, Out, transpose_x, transpose_y, alpha); + } + + if (phi::vectorize(Out->dims()) != out_dims) { + Out->Resize(phi::make_ddim(out_dims)); + } + } +}; + +template +class MatMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = phi::vectorize(X->dims()); + std::vector y_dims = phi::vectorize(Y->dims()); + std::vector out_dims = phi::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); + + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + if (dX) { + Mul(ctx, *dOut, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, *dOut, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(phi::make_ddim(x_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(phi::make_ddim(y_dims)); + dout_temp.Resize(phi::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(phi::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, y_temp, dout_temp, dX, transpose_y, true, alpha); + } else { + MatMul2D(ctx, dout_temp, y_temp, dX, false, !transpose_y, alpha); + } + dX->Resize(X->dims()); + } + if (dY) { + dY->Resize(phi::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, dout_temp, x_temp, dY, true, transpose_x, alpha); + } else { + MatMul2D(ctx, x_temp, dout_temp, dY, !transpose_x, false, alpha); + } + dY->Resize(Y->dims()); + } + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N] + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_bcast_dims(out_ndim, 1); + std::vector y_bcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_bcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_bcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_bcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_bcast_dims.end() - 2); + + if (dX) { + Tensor dx_temp(X->type()); + if (x_dims != x_bcast_dims) { + dx_temp.Resize(phi::make_ddim(x_bcast_dims)); + } else { + dX->mutable_data(ctx.GetPlace()); + dx_temp.ShareDataWith(*dX); + } + + if (transpose_x) { + MatMulND(ctx, y_temp, dout_temp, &dx_temp, transpose_y, true, alpha); + } else { + MatMulND(ctx, dout_temp, y_temp, &dx_temp, false, !transpose_y, + alpha); + } + + if (x_dims != x_bcast_dims) { + ReduceDims(ctx, x_dims, 
x_bcast_dims, dx_temp, dX); + } + } + + if (dY) { + Tensor dy_temp(Y->type()); + if (y_dims != y_bcast_dims) { + dy_temp.Resize(phi::make_ddim(y_bcast_dims)); + } else { + dY->mutable_data(ctx.GetPlace()); + dy_temp.ShareDataWith(*dY); + } + + if (transpose_y) { + MatMulND(ctx, dout_temp, x_temp, &dy_temp, true, transpose_x, alpha); + } else { + MatMulND(ctx, x_temp, dout_temp, &dy_temp, !transpose_x, false, + alpha); + } + + if (y_dims != y_bcast_dims) { + ReduceDims(ctx, y_dims, y_bcast_dims, dy_temp, dY); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(matmul, ops::MatMulMLUKernel, + ops::MatMulMLUKernel); +REGISTER_OP_MLU_KERNEL(matmul_grad, ops::MatMulGradMLUKernel, + ops::MatMulGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py new file mode 100644 index 0000000000000..adfff112e6be2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_matmul_op_mlu.py @@ -0,0 +1,329 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size, )) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size, )) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if not Out.shape: + # We do not support 0-dimensional Tensors (scalars). So where + # np.matmul outputs a scalar, we must convert to a Tensor of + # shape (1, ) instead. + # Everywhere else, we are compatible with np.matmul. 
+ Out = np.array([Out], dtype="float64") + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_mlu() + self.op_type = "matmul" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, + self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {'X': X, 'Y': Y} + self.attrs = { + 'transpose_X': self.transpose_X, + 'transpose_Y': self.transpose_Y, + 'alpha': self.alpha + } + self.outputs = {'Out': Out} + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def config(self): + self.x_shape = (100, ) + self.y_shape = (100, ) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100, ) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 2, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 
5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.transpose_X = False + self.transpose_Y = False + + +# TODO(mlu): alpha will be supported in next version +#--------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +#--------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() From 0764fda25bb016bf143fc0a3aa93a3fb56b0cd73 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 2 Mar 2022 15:07:34 +0800 Subject: [PATCH 04/41] [Phi] Unify complex type trait and fix real imag bug (#40036) * unify complex type trait and fix real imag bug * add unittest for type tratis --- paddle/fluid/operators/angle_op.h | 6 +- paddle/fluid/operators/eig_op.h | 26 ++-- paddle/fluid/operators/eigh_op.h | 2 +- paddle/fluid/operators/eigvals_op.h | 14 +- paddle/fluid/operators/imag_op.cc | 2 +- paddle/fluid/operators/lstsq_op.h | 4 +- .../operators/math/eigen_values_vectors.h | 8 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/qr_op.cu | 14 +- paddle/fluid/operators/qr_op.h | 18 +-- paddle/fluid/operators/real_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 12 +- paddle/fluid/operators/svd_op.h | 12 +- paddle/phi/common/type_traits.h | 96 ++++++++++++++ paddle/phi/infermeta/unary.cc | 7 + paddle/phi/infermeta/unary.h | 2 + 
paddle/phi/kernels/cpu/abs_kernel.cc | 6 +- paddle/phi/kernels/cpu/complex_kernel.cc | 8 +- paddle/phi/kernels/funcs/complex_functors.h | 123 ++++++------------ paddle/phi/kernels/gpu/abs_kernel.cu | 10 +- paddle/phi/kernels/gpu/complex_kernel.cu | 8 +- .../phi/kernels/impl/abs_grad_kernel_impl.h | 2 +- .../kernels/impl/complex_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 8 +- paddle/phi/tests/common/test_data_type.cc | 16 +++ 25 files changed, 247 insertions(+), 165 deletions(-) create mode 100644 paddle/phi/common/type_traits.h diff --git a/paddle/fluid/operators/angle_op.h b/paddle/fluid/operators/angle_op.h index db5a3ea296194..116a8053db3ed 100644 --- a/paddle/fluid/operators/angle_op.h +++ b/paddle/fluid/operators/angle_op.h @@ -36,8 +36,8 @@ class AngleKernel : public framework::OpKernel { auto numel = x->numel(); auto* x_data = x->data(); - auto* out_data = out->mutable_data>( - context.GetPlace(), size_t(x->numel() * sizeof(phi::funcs::Real))); + auto* out_data = out->mutable_data>( + context.GetPlace(), size_t(x->numel() * sizeof(phi::dtype::Real))); auto& dev_ctx = context.template device_context(); platform::ForRange for_range(dev_ctx, numel); @@ -57,7 +57,7 @@ class AngleGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("X")); auto numel = d_out->numel(); - auto* dout_data = d_out->data>(); + auto* dout_data = d_out->data>(); auto* x_data = x->data(); auto* dx_data = d_x->mutable_data( ctx.GetPlace(), static_cast(numel * sizeof(T))); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index 03b25c6705ac5..e9c6c1eb7eced 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -87,19 +87,19 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, int values_stride = values->dims()[values->dims().size() - 1]; Tensor rwork; - phi::funcs::Real* rwork_data = nullptr; + phi::dtype::Real* rwork_data = nullptr; rwork.Resize(phi::make_ddim({lda * 2})); - rwork_data = rwork.mutable_data>(context.GetPlace()); + rwork_data = rwork.mutable_data>(context.GetPlace()); // call lapackEig once to compute the size of work; T computed_work_size; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); lwork = std::max( - 1, static_cast(phi::funcs::Real(computed_work_size))); + 1, static_cast(phi::dtype::Real(computed_work_size))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); @@ -109,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, T* current_values = &values_data[i * values_stride]; T* current_rvectors = &rvector_data[i * matrix_stride]; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); PADDLE_ENFORCE_EQ( @@ -207,23 +207,23 @@ class EigKernel : public framework::OpKernel { origin_dim.push_back(last_item * 2); framework::DDim big_dim = phi::make_ddim(origin_dim); - real_values.mutable_data>(big_dim, + real_values.mutable_data>(big_dim, context.GetPlace()); - real_vectors.mutable_data>(x->dims(), + real_vectors.mutable_data>(x->dims(), context.GetPlace()); - ApplyEigKernel>( + ApplyEigKernel>( *x, &real_values, &real_vectors, context); auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, 
phi::funcs::Real, Tout>(context); + DeviceContext, phi::dtype::Real, Tout>(context); // 1. extract real part & imag part from real_values Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); // 2. construct complex values - auto* real_part_data = real_part.data>(); - auto* imag_part_data = imag_part.data>(); + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); int out_values_numel = out_values->numel(); platform::ForRange for_range( context.template device_context(), out_values_numel); @@ -236,7 +236,7 @@ class EigKernel : public framework::OpKernel { Tensor real_vector_trans = dito.Transpose(real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); - ConstructComplexVectors, Tout>( + ConstructComplexVectors, Tout>( &out_vectors_trans, *out_values, real_vector_trans, context, batch_count, order); TransposeTwoAxis(out_vectors_trans, out_vectors, @@ -272,7 +272,7 @@ void ComputeBackwardForComplexInput( // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un = diag_unsqueezed.data>(); auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), static_cast(numel * sizeof(Tout))); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h index 294794877b32e..5279ec750935c 100644 --- a/paddle/fluid/operators/eigh_op.h +++ b/paddle/fluid/operators/eigh_op.h @@ -40,7 +40,7 @@ template class EighGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto& x_grad = *ctx.Output(framework::GradVarName("X")); x_grad.mutable_data(ctx.GetPlace()); auto& output_w = *ctx.Input("Eigenvalues"); diff --git a/paddle/fluid/operators/eigvals_op.h b/paddle/fluid/operators/eigvals_op.h index 59eabfb29b97e..4627acc0d07de 100644 --- a/paddle/fluid/operators/eigvals_op.h +++ b/paddle/fluid/operators/eigvals_op.h @@ -48,7 +48,7 @@ struct PaddleComplex< template using PaddleCType = typename PaddleComplex::type; template -using Real = typename phi::funcs::Real; +using Real = typename phi::dtype::Real; static void SpiltBatchSquareMatrix(const Tensor& input, std::vector* output) { @@ -144,7 +144,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_work_mem, work_mem)); int64_t rwork_mem = rwork->memory_size(); - int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::funcs::Real); + int64_t required_rwork_mem = (n_dim << 1) * sizeof(phi::dtype::Real); PADDLE_ENFORCE_GE( rwork_mem, required_rwork_mem, platform::errors::InvalidArgument( @@ -154,11 +154,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input, required_rwork_mem, rwork_mem)); int info = 0; - phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), a.template data(), static_cast(n_dim), output->template data(), NULL, 1, NULL, 1, work->template data(), static_cast(work_mem / sizeof(T)), - rwork->template data>(), &info); + rwork->template data>(), &info); std::string name = "framework::platform::dynload::cgeev_"; if (framework::TransToProtoVarType(input.dtype()) == @@ -188,10 +188,10 @@ class EigvalsKernel : public framework::OpKernel { // query workspace size T qwork; int info; - 
phi::funcs::lapackEig>( + phi::funcs::lapackEig>( 'N', 'N', static_cast(n_dim), input_matrices[0].template data(), static_cast(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1, - static_cast*>(NULL), &info); + static_cast*>(NULL), &info); int64_t lwork = static_cast(qwork); Tensor work, rwork; @@ -208,7 +208,7 @@ class EigvalsKernel : public framework::OpKernel { } if (framework::IsComplexType( framework::TransToProtoVarType(input->dtype()))) { - rwork.mutable_data>(phi::make_ddim({n_dim << 1}), + rwork.mutable_data>(phi::make_ddim({n_dim << 1}), ctx.GetPlace()); } diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 33b68d68992dd..567a69f383d1c 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index a4c3d1c81fb3e..3cbbc62e7bec9 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -46,7 +46,7 @@ template class LstsqCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; const Tensor& x = *context.Input("X"); auto y = context.Input("Y"); @@ -169,7 +169,7 @@ class LstsqCPUKernel : public framework::OpKernel { &rank_32, &wkopt, lwork, &rwkopt, &info); } - lwork = std::max(1, static_cast(phi::funcs::Real(wkopt))); + lwork = std::max(1, static_cast(phi::dtype::Real(wkopt))); Tensor work; work.Resize(phi::make_ddim({lwork})); T* work_data = work.mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h index 9b6ebf73d9b09..1ade2190bb96e 100644 --- a/paddle/fluid/operators/math/eigen_values_vectors.h +++ b/paddle/fluid/operators/math/eigen_values_vectors.h @@ -63,7 +63,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto dito = @@ -123,7 +123,7 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; i++) { auto *value_data = out_value + i * values_stride; auto *input_data = input_vector + i * vector_stride; - phi::funcs::lapackEigh>( + phi::funcs::lapackEigh>( jobz, uplo, n, input_data, lda, value_data, work_data, lwork, rwork_data, lrwork, iwork_data, liwork, &info); CheckEighResult(i, info); @@ -151,7 +151,7 @@ struct MatrixEighFunctor { void operator()(const framework::ExecutionContext &ctx, const Tensor &input, Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower, bool has_vectors) { - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; auto *out_value = eigen_values->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); @@ -233,7 +233,7 @@ struct MatrixEighFunctor { } } - using ValueType = phi::funcs::Real; + using ValueType = phi::dtype::Real; inline void EvdBuffer(cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, const T *A, int lda, const 
ValueType *W, int *lwork) const; diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 38692a646111e..9994ccc10cb13 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -115,7 +115,7 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) { - using RealT = phi::funcs::Real; + using RealT = phi::dtype::Real; constexpr auto kSharedBufferSize = framework::IsComplex::value ? 4 * kThreadNumX : 2 * kThreadNumX; __shared__ RealT sbuf[kThreadNumY][kSharedBufferSize]; diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu index 5e841a097fed7..a57a8d5cf8b7f 100644 --- a/paddle/fluid/operators/qr_op.cu +++ b/paddle/fluid/operators/qr_op.cu @@ -56,13 +56,13 @@ class QrGPUKernel : public framework::OpKernel { int tau_stride = min_mn; if (compute_q) { - q.mutable_data>( + q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - r.mutable_data>( + r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); auto dito = math::DeviceIndependenceTensorOperations { // Note: allocate temporary tensors because of lacking in-place operatios. // Prepare qr Tensor qr; - qr.mutable_data>( + qr.mutable_data>( context.GetPlace(), - size_t(batch_size * m * n * sizeof(phi::funcs::Real))); + size_t(batch_size * m * n * sizeof(phi::dtype::Real))); // BatchedGeqrf performs computation in-place and 'qr' must be a copy of // input paddle::framework::TensorCopy(x, context.GetPlace(), &qr); @@ -126,7 +126,7 @@ class QrGPUKernel : public framework::OpKernel { for (int i = 0; i < batch_size; ++i) { memory::Copy(dev_ctx.GetPlace(), (new_qr_data + i * new_qr_stride), dev_ctx.GetPlace(), (qr_data + i * qr_stride), - qr_stride * sizeof(phi::funcs::Real), + qr_stride * sizeof(phi::dtype::Real), dev_ctx.stream()); } BatchedOrgqr( diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index cef9371fea099..f09a07e96cd34 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -74,19 +74,19 @@ class QrCPUKernel : public framework::OpKernel { int q_stride = m * k; int r_stride = k * n; - auto* x_data = x.data>(); + auto* x_data = x.data>(); T* q_data = nullptr; if (compute_q) { - q_data = q.mutable_data>( + q_data = q.mutable_data>( context.GetPlace(), - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); memset(q_data, 0, - size_t(batch_size * m * k * sizeof(phi::funcs::Real))); + size_t(batch_size * m * k * sizeof(phi::dtype::Real))); } - auto* r_data = r.mutable_data>( + auto* r_data = r.mutable_data>( context.GetPlace(), - size_t(batch_size * k * n * sizeof(phi::funcs::Real))); - memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::funcs::Real))); + size_t(batch_size * k * n * sizeof(phi::dtype::Real))); + memset(r_data, 0, size_t(batch_size * k * n * sizeof(phi::dtype::Real))); // Implement QR by calling Eigen for (int i = 0; i < batch_size; ++i) { @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { // Use a different name dA instead of dX framework::Tensor& dA = *ctx.Output(framework::GradVarName("X")); - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = 
ctx.template device_context(); phi::funcs::SetConstant()(dev_ctx, &dA, T(0)); @@ -224,7 +224,7 @@ class QrGradKernel : public framework::OpKernel { } else { // If m < n for input matrices A, we partition A = [X|Y] and R = [U|V] // Calculate dX and dY individually and concatenate them to get dA - dA.mutable_data>(ctx.GetPlace()); + dA.mutable_data>(ctx.GetPlace()); auto Y = dito.Slice(A, {-1}, {m}, {n}); auto U = dito.Slice(R, {-1}, {0}, {m}); diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 1f3691978b577..28a8484f539fc 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -83,7 +83,7 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace paddle DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); + PT_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index bcb3ee44f0465..166f49999d552 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -105,7 +105,7 @@ struct RealMulComplexFunctor { "The image part of y must to be 0" "but got [%d]", y.imag)); - return platform::complex>(x.real * y.real, + return platform::complex>(x.real * y.real, x.imag * y.real); } }; @@ -391,11 +391,11 @@ struct DeviceIndependenceTensorOperations { // batch_diag for CPU only Tensor BatchDiag(const Tensor& x, int batch) { Tensor out; - auto* x_data = x.data>(); + auto* x_data = x.data>(); auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto x_dims = x.dims(); int num_dims = x_dims.size(); @@ -661,9 +661,9 @@ struct DeviceIndependenceTensorOperations { Tensor Real(const Tensor& x) { Tensor out; auto numel = x.numel(); - auto* out_data = out.mutable_data>( + auto* out_data = out.mutable_data>( x.dims(), context.GetPlace(), - static_cast(numel * sizeof(phi::funcs::Real))); + static_cast(numel * sizeof(phi::dtype::Real))); auto* x_data = x.data(); auto for_range = GetForRange(numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); diff --git a/paddle/fluid/operators/svd_op.h b/paddle/fluid/operators/svd_op.h index f5e451ac7054d..42a847206a3cb 100644 --- a/paddle/fluid/operators/svd_op.h +++ b/paddle/fluid/operators/svd_op.h @@ -46,14 +46,14 @@ class SvdCPUKernel : public framework::OpKernel { int col_u = full ? rows : k; int col_v = full ? 
cols : k; int batches = numel / (rows * cols); - auto* U_out = U->mutable_data>( + auto* U_out = U->mutable_data>( context.GetPlace(), - size_t(batches * rows * col_u * sizeof(phi::funcs::Real))); - auto* VH_out = VH->mutable_data>( + size_t(batches * rows * col_u * sizeof(phi::dtype::Real))); + auto* VH_out = VH->mutable_data>( context.GetPlace(), - size_t(batches * col_v * cols * sizeof(phi::funcs::Real))); - auto* S_out = S->mutable_data>( - context.GetPlace(), size_t(batches * k * sizeof(phi::funcs::Real))); + size_t(batches * col_v * cols * sizeof(phi::dtype::Real))); + auto* S_out = S->mutable_data>( + context.GetPlace(), size_t(batches * k * sizeof(phi::dtype::Real))); /*SVD Use the Eigen Library*/ math::BatchSvd(x_data, U_out, VH_out, S_out, rows, cols, batches, full); } diff --git a/paddle/phi/common/type_traits.h b/paddle/phi/common/type_traits.h new file mode 100644 index 0000000000000..ef894eee46835 --- /dev/null +++ b/paddle/phi/common/type_traits.h @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/data_type.h" + +namespace phi { +namespace dtype { + +template +struct cond { + static constexpr bool value = B; + using type = T; +}; + +template +struct eval_if { + using type = typename TrueF::type; +}; + +template +struct eval_if { + using type = typename FalseF::type; +}; + +template +using eval_if_t = typename eval_if::type; + +template +struct select { + using type = eval_if_t>; +}; + +template +struct select { + using type = T; +}; + +template +struct select> { + // last one had better be true! + static_assert(B, "No match select type!"); + using type = T; +}; + +template +using select_t = typename select::type; + +// runtime real and complex type conversion + +template +using Real = select_t>::value, float>, + cond>::value, double>, + T>; + +template +using Complex = select_t::value, complex>, + cond::value, complex>, + T>; + +inline DataType ToReal(DataType dtype) { + switch (dtype) { + case phi::DataType::COMPLEX64: + return phi::DataType::FLOAT32; + case phi::DataType::COMPLEX128: + return phi::DataType::FLOAT64; + default: + return dtype; + } +} + +inline DataType ToComplex(DataType dtype) { + switch (dtype) { + case phi::DataType::FLOAT32: + return phi::DataType::COMPLEX64; + case phi::DataType::FLOAT64: + return phi::DataType::COMPLEX128; + default: + return dtype; + } +} + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 983e016226492..fbd9259a83f86 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/kernels/funcs/unfold_functor.h" @@ -51,6 +52,12 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, out->share_meta(x); } +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(dtype::ToReal(x.dtype())); + out->set_layout(x.layout()); +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a2d779e0f7093..3c0628981af7c 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -39,6 +39,8 @@ void UnchangedInferMetaCheckAxis(const MetaTensor& x, int axis, MetaTensor* out); +void RealAndImagInferMeta(const MetaTensor& x, MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index efe7d090405df..9f89fc27a7167 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -25,9 +25,9 @@ template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - ctx.template Alloc>( - out, size_t(x.numel() * sizeof(phi::funcs::Real))); - auto* out_data = out->data>(); + ctx.template Alloc>( + out, size_t(x.numel() * sizeof(phi::dtype::Real))); + auto* out_data = out->data>(); phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 801502e16737d..859d5a84527a2 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -37,11 +37,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, CPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 86dbdd099ecde..8b292cb5dc52e 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -20,56 +20,12 @@ limitations under the License. */ #include #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/hostdevice.h" namespace phi { namespace funcs { -template -struct cond { - static constexpr bool value = B; - using type = T; -}; - -template -struct eval_if { - using type = typename TrueF::type; -}; - -template -struct eval_if { - using type = typename FalseF::type; -}; - -template -using eval_if_t = typename eval_if::type; - -template -struct select { - using type = eval_if_t>; -}; - -template -struct select { - using type = T; -}; - -template -struct select> { - // last one had better be true! 
- static_assert(B, "No match select type!"); - using type = T; -}; - -template -using select_t = typename select::type; - -template -using Real = - select_t>::value, float>, - cond>::value, double>, - T>; - template using Complex = typename std::enable_if::value>::type; @@ -91,9 +47,9 @@ template struct RealFunctor; template -struct RealFunctor>> { +struct RealFunctor>> { public: - RealFunctor(const T* input, Real* output, int64_t numel) + RealFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -102,7 +58,7 @@ struct RealFunctor>> { private: const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -110,8 +66,8 @@ template struct ImagFunctor; template -struct ImagFunctor>> { - ImagFunctor(const T* input, Real* output, int64_t numel) +struct ImagFunctor>> { + ImagFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -119,7 +75,7 @@ struct ImagFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; @@ -127,8 +83,8 @@ template struct AbsFunctor; template -struct AbsFunctor>> { - AbsFunctor(const T* input, Real* output, int64_t numel) +struct AbsFunctor>> { + AbsFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -136,12 +92,12 @@ struct AbsFunctor>> { } const T* input_; - Real* output_; + dtype::Real* output_; int64_t numel_; }; template -struct AbsFunctor>> { +struct AbsFunctor>> { AbsFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -203,7 +159,10 @@ struct AbsGradCUDAFunctor> { template struct AbsGradFunctor { - AbsGradFunctor(const Real* dout, const T* x, T* output, int64_t numel) + AbsGradFunctor(const dtype::Real* dout, + const T* x, + T* output, + int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -214,7 +173,7 @@ struct AbsGradFunctor { } } - const Real* dout_; + const dtype::Real* dout_; const T* x_; T* output_; int64_t numel_; @@ -334,8 +293,8 @@ template struct RealToComplexFunctor; template -struct RealToComplexFunctor>> { - RealToComplexFunctor(const Real* input, T* output, int64_t numel) +struct RealToComplexFunctor>> { + RealToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -343,7 +302,7 @@ struct RealToComplexFunctor>> { output_[idx].imag = 0; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -352,8 +311,8 @@ template struct ImagToComplexFunctor; template -struct ImagToComplexFunctor>> { - ImagToComplexFunctor(const Real* input, T* output, int64_t numel) +struct ImagToComplexFunctor>> { + ImagToComplexFunctor(const dtype::Real* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -361,7 +320,7 @@ struct ImagToComplexFunctor>> { output_[idx].imag = input_[idx]; } - const Real* input_; + const dtype::Real* input_; T* output_; int64_t numel_; }; @@ -370,9 +329,9 @@ template struct RealImagToComplexFunctor; template -struct RealImagToComplexFunctor>> { - RealImagToComplexFunctor(const Real* input_real, - const Real* input_imag, +struct 
RealImagToComplexFunctor>> { + RealImagToComplexFunctor(const dtype::Real* input_real, + const dtype::Real* input_imag, T* output, int64_t numel) : input_real_(input_real), @@ -385,8 +344,8 @@ struct RealImagToComplexFunctor>> { output_[idx].imag = input_imag_[idx]; } - const Real* input_real_; - const Real* input_imag_; + const dtype::Real* input_real_; + const dtype::Real* input_imag_; T* output_; int64_t numel_; }; @@ -423,8 +382,8 @@ struct AngleFunctor; // angel function for complex template -struct AngleFunctor>> { - AngleFunctor(const T* input, phi::funcs::Real* output, int64_t numel) +struct AngleFunctor>> { + AngleFunctor(const T* input, dtype::Real* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { @@ -432,13 +391,13 @@ struct AngleFunctor>> { } const T* input_; - phi::funcs::Real* output_; + dtype::Real* output_; int64_t numel_; }; // angel function for real template -struct AngleFunctor>> { +struct AngleFunctor>> { AngleFunctor(const T* input, T* output, int64_t numel) : input_(input), output_(output), numel_(numel) {} @@ -456,25 +415,22 @@ struct AngleGradFunctor; // angle grad for complex template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { if (x_[idx] == T(0)) { dx_[idx] = T(0); } else { - const phi::funcs::Real r_square = + const phi::dtype::Real r_square = x_[idx].real * x_[idx].real + x_[idx].imag * x_[idx].imag; dx_[idx] = T(-dout_[idx] * x_[idx].imag / r_square, dout_[idx] * x_[idx].real / r_square); } } - const phi::funcs::Real* dout_; + const phi::dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; @@ -482,16 +438,13 @@ struct AngleGradFunctor>> { // angle grad for real template -struct AngleGradFunctor>> { - AngleGradFunctor(const phi::funcs::Real* dout, - const T* x, - T* dx, - int64_t numel) +struct AngleGradFunctor>> { + AngleGradFunctor(const dtype::Real* dout, const T* x, T* dx, int64_t numel) : dout_(dout), x_(x), dx_(dx), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { dx_[idx] = 0; } - const phi::funcs::Real* dout_; + const dtype::Real* dout_; const T* x_; T* dx_; int64_t numel_; diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index e122e6b1e9c8a..5c424316a83df 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -27,14 +27,14 @@ template struct CudaAbsFunctor; template -struct CudaAbsFunctor>> { - __device__ __forceinline__ phi::funcs::Real operator()(const T x) const { +struct CudaAbsFunctor>> { + __device__ __forceinline__ phi::dtype::Real operator()(const T x) const { return abs(x); } }; template -struct CudaAbsFunctor>> { +struct CudaAbsFunctor>> { __device__ __forceinline__ T operator()(const T x) const { return std::abs(x); } @@ -42,12 +42,12 @@ struct CudaAbsFunctor>> { template void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - ctx.template Alloc>(out); + ctx.template Alloc>(out); std::vector ins = {&x}; std::vector outs = {out}; auto functor = CudaAbsFunctor(); - funcs::ElementwiseKernel>(ctx, ins, &outs, functor); + funcs::ElementwiseKernel>(ctx, ins, &outs, functor); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu 
b/paddle/phi/kernels/gpu/complex_kernel.cu index d0b086718a444..e03e079581a9b 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -38,11 +38,15 @@ PD_REGISTER_KERNEL(real, ALL_LAYOUT, phi::RealKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} PD_REGISTER_KERNEL(imag, GPU, ALL_LAYOUT, phi::ImagKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 78c25200bbd28..9dad40b57c916 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -47,7 +47,7 @@ void AbsGradKernel(const Context& ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* x_data = x.data(); ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index a10481284b17f..03896a2353dda 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -24,7 +24,7 @@ void RealGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); @@ -38,7 +38,7 @@ void ImagGradKernel(const Context& dev_ctx, const DenseTensor& dout, DenseTensor* dx) { auto numel = dout.numel(); - auto* dout_data = dout.data>(); + auto* dout_data = dout.data>(); auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index ff5cf86ed2ea2..72b1328833979 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -39,8 +39,8 @@ void RealKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); @@ -53,8 +53,8 @@ void ImagKernel(const Context& dev_ctx, DenseTensor* out) { auto numel = x.numel(); auto* x_data = x.data(); - auto* out_data = dev_ctx.template Alloc>( - out, static_cast(numel * sizeof(phi::funcs::Real))); + auto* out_data = dev_ctx.template Alloc>( + out, static_cast(numel * sizeof(phi::dtype::Real))); phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagFunctor functor(x_data, out_data, numel); diff --git a/paddle/phi/tests/common/test_data_type.cc b/paddle/phi/tests/common/test_data_type.cc index c962c68b4d5f2..5a1b41d796d33 100644 --- a/paddle/phi/tests/common/test_data_type.cc +++ b/paddle/phi/tests/common/test_data_type.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/exception.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/type_traits.h" namespace phi { namespace tests { @@ -71,5 +72,20 @@ TEST(DataType, OStream) { } } +TEST(TypeTraits, Complex) { + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX64), + phi::DataType::FLOAT32); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::COMPLEX128), + phi::DataType::FLOAT64); + EXPECT_EQ(phi::dtype::ToReal(phi::DataType::FLOAT32), phi::DataType::FLOAT32); + + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT32), + phi::DataType::COMPLEX64); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::FLOAT64), + phi::DataType::COMPLEX128); + EXPECT_EQ(phi::dtype::ToComplex(phi::DataType::COMPLEX64), + phi::DataType::COMPLEX64); +} + } // namespace tests } // namespace phi From 90ab7403753acad5c93b425f6a909a526aa57a3d Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Wed, 2 Mar 2022 15:11:42 +0800 Subject: [PATCH 05/41] [KP] Activation op registration for XPU2. part 1/2 (#40002) --- .../{activation_op.cu => activation_op.kps} | 64 +++++++++++++++++++ .../platform/device/xpu/xpu_op_kpfirst_list.h | 26 ++++++++ 2 files changed, 90 insertions(+) rename paddle/fluid/operators/{activation_op.cu => activation_op.kps} (94%) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.kps similarity index 94% rename from paddle/fluid/operators/activation_op.cu rename to paddle/fluid/operators/activation_op.kps index e578ad899e74b..e1afb3919f813 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.kps @@ -1861,3 +1861,67 @@ REGISTER_OP_CUDA_KERNEL( __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ CudaHardSwishGradFunctor); FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) + +#ifdef PADDLE_WITH_XPU_KP +#define REGISTER_ACTIVATION_XPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_KERNEL( \ + act_type, KP, plat::XPUPlace, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_KERNEL(act_type##_grad, KP, plat::XPUPlace, \ + ops::ActivationGradCudaKernel>); + +REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor, + CudaReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softplus, Softplus, CudaSoftplusFunctor, + CudaSoftplusGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, HardSwish, CudaHardSwishFunctor, + CudaHardSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(elu, Elu, CudaELUFunctor, CudaELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(celu, Celu, CudaCELUFunctor, + CudaCELUGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); 
+REGISTER_ACTIVATION_XPU_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(log1p, Log1p, CudaLog1pFunctor, + CudaLog1pGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(brelu, BRelu, CudaBReluFunctor, + CudaBReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(soft_relu, SoftRelu, CudaSoftReluFunctor, + CudaSoftReluGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(softsign, Softsign, CudaSoftsignFunctor, + CudaSoftsignGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(relu6, Relu6, CudaRelu6Functor, + CudaRelu6GradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_shrink, HardShrink, CudaHardShrinkFunctor, + CudaHardShrinkGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(hard_sigmoid, HardSigmoid, + CudaHardSigmoidFunctor, + CudaHardSigmoidGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(swish, Swish, CudaSwishFunctor, + CudaSwishGradFunctor); +REGISTER_ACTIVATION_XPU_KERNEL(thresholded_relu, ThresholdedRelu, + CudaThresholdedReluFunctor, + CudaThresholdedReluGradFunctor); + +#endif // PADDLE_WITH_XPU_KP diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index f79ef8505d878..c5dff84723ccf 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -30,6 +30,32 @@ XPUOpMap& get_kp_ops() { static XPUOpMap s_xpu_kp_kernels{ {"elementwise_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + // activation op + {"exp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"leaky_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softplus", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reciprocal", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"elu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"celu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sqrt", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"square", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"silu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logsigmoid", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softshrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"ceil", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"floor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"log1p", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"brelu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"soft_relu", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"softsign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"relu6", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_shrink", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"hard_sigmoid", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, }; return s_xpu_kp_kernels; From 244ae318c2fbfea0ab4315a17f6e6296c6be2624 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Wed, 2 Mar 2022 15:24:36 +0800 Subject: [PATCH 06/41] [fleet_executor] Add entrance of FleetExecutor 
in AnalysisPredictor for distributed inference (#39992) --- .../distributed/fleet_executor/carrier.cc | 24 +- .../distributed/fleet_executor/carrier.h | 7 +- .../fleet_executor/fleet_executor.cc | 48 ++- .../fleet_executor/fleet_executor.h | 10 +- .../distributed/fleet_executor/task_node.cc | 11 +- .../distributed/fleet_executor/task_node.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 3 + .../fluid/inference/api/analysis_predictor.cc | 289 +++++++++++++++++- .../fluid/inference/api/analysis_predictor.h | 59 ++++ .../inference/api/paddle_analysis_config.h | 57 ++++ .../fluid/inference/tests/api/CMakeLists.txt | 6 + .../tests/api/analyzer_dist_model_tester.cc | 72 +++++ paddle/fluid/pybind/bind_fleet_executor.cc | 2 +- paddle/fluid/pybind/inference_api.cc | 19 +- python/paddle/fluid/executor.py | 5 +- 15 files changed, 581 insertions(+), 33 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 56d8da3eca4b5..0d5d328fd32cc 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include + #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -46,7 +48,8 @@ void Carrier::Init( const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place) { + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; @@ -60,7 +63,7 @@ void Carrier::Init( microbatch_scopes_.resize(num_micro_batches); for (int i = 0; i < num_micro_batches; ++i) { microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program); + CopyParameters(i, program, inference_root_scope_vars); } // TODO(fleet_exe dev): thread pool @@ -80,12 +83,23 @@ void Carrier::Release() { Carrier::~Carrier() { VLOG(3) << "Carrier's destructor."; } -void Carrier::CopyParameters(int microbatch_id, - const framework::ProgramDesc& program) { +void Carrier::CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars) { auto& global_block = program.Block(0); + std::map inference_root_scope_var_map; + for (auto var_name : inference_root_scope_vars) { + inference_root_scope_var_map.insert({var_name, 1}); + } for (auto& var : global_block.AllVars()) { - if (var->Persistable() && microbatch_id == 0) { + std::string var_name = var->Name(); + bool force_root = inference_root_scope_var_map.find(var_name) != + inference_root_scope_var_map.end(); + if (force_root) { + VLOG(4) << var_name << " will be forced to be created in the root scope."; + } + if ((var->Persistable() || force_root) && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(5) << "Create persistable var: " << var->Name() diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index 
9a74fa78c0e76..d35a3260915e2 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -57,9 +57,12 @@ class Carrier final { const std::unordered_map& interceptor_id_to_rank, const std::unordered_map& interceptor_id_to_node, const framework::ProgramDesc& program, framework::Scope* scope, - int64_t num_micro_batches, const platform::Place& place); + int64_t num_micro_batches, const platform::Place& place, + const std::vector& inference_root_scope_vars = {}); - void CopyParameters(int microbatch_id, const framework::ProgramDesc& program); + void CopyParameters( + int microbatch_id, const framework::ProgramDesc& program, + const std::vector& inference_root_scope_vars); void Release(); void Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index 457549a27b4b7..e946d78550ff1 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#include #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/global.h" @@ -52,7 +53,8 @@ void FleetExecutor::Init( const std::string& carrier_id, const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank) { + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars) { PADDLE_ENFORCE_GT(task_nodes.size(), 0, platform::errors::InvalidArgument( "Fleet executor is inited with empty task node")); @@ -64,6 +66,37 @@ void FleetExecutor::Init( } } auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); + // NOTE: For inference, the vars in inference_root_scope_vars + // shouldn't be deleted during inf, for that they may be the result of the + // inf. If they are GCed, it will cause error during ZeroCopy the result. 
+ std::vector changed_ops; + for (auto pair : unused_vars) { + const framework::OperatorBase* op = pair.first; + std::vector unused = pair.second; + for (auto name : inference_root_scope_vars) { + auto iter = std::find(unused.begin(), unused.end(), name); + if (iter != unused.end()) { + VLOG(3) << "Removing var: [" << name + << "] from the unused vars list of op: [" << op->Type() << "]"; + unused.erase(iter); + if (std::find(changed_ops.begin(), changed_ops.end(), op) == + changed_ops.end()) { + // record the op whose unused vars have been updated + changed_ops.emplace_back(op); + } + } + } + // update the unused vars list in the map + unused_vars[op] = unused; + } + for (auto op : changed_ops) { + auto iter = unused_vars.find(op); + if (iter->second.empty()) { + // remove those ops in the map that have empty unused vars list + VLOG(3) << "Removing op: [" << op->Type() << "] from unused_vars map."; + unused_vars.erase(iter); + } + } runtime_graph_ = std::make_shared(); std::unordered_map interceptor_id_to_task; for (auto task_node : task_nodes) { @@ -82,17 +115,18 @@ void FleetExecutor::Init( carrier_ids_.insert(carrier_id); // Set current running carrier GlobalVal::Set(new std::string(carrier_id)); - InitCarrier(carrier, scope, place, num_micro_batches, program_desc); + InitCarrier(carrier, scope, place, num_micro_batches, program_desc, + inference_root_scope_vars); GlobalVal::Get()->Barrier(); } -void FleetExecutor::InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, - int64_t num_micro_batches, - const framework::ProgramDesc& program_desc) { +void FleetExecutor::InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), program_desc, scope, - num_micro_batches, place); + num_micro_batches, place, inference_root_scope_vars); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index fa65309127bec..ccdb3dcc45948 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -42,15 +42,17 @@ class FleetExecutor final { const framework::ProgramDesc& program_desc, framework::Scope* scope, const platform::Place& place, int64_t num_micro_batches, const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank); + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars = {}); void Run(const std::string& carrier_id); private: DISABLE_COPY_AND_ASSIGN(FleetExecutor); void InitMessageBus(); - void InitCarrier(Carrier* carrier, framework::Scope* scope, - const platform::Place& place, int64_t num_micro_batches, - const framework::ProgramDesc& program_desc); + void InitCarrier( + Carrier* carrier, framework::Scope* scope, const platform::Place& place, + int64_t num_micro_batches, const framework::ProgramDesc& program_desc, + const std::vector& inference_root_scope_vars = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 6de7038b3231f..95e4c73305998 100644 --- 
a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -52,11 +52,20 @@ void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) { program_ = program; } -void TaskNode::Init() { +void TaskNode::Init(bool use_feed_fetch_ops) { + if (!use_feed_fetch_ops) { + VLOG(3) << "TaskNode will be inited without feed and fetch ops"; + } if (ops_.empty()) { // Q (for fleet executor dev): should we need another reset funct? VLOG(3) << "Task node will be inited by calling Init()."; for (const auto& op_desc : program_->Block(0).AllOps()) { + if (!use_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + VLOG(3) << "TaskNode will skip [" << op_desc->Input("X")[0] << "], " + << op_desc->Type() << " -> " << op_desc->Output("Out")[0]; + continue; + } ops_vec_.emplace_back(framework::OpRegistry::CreateOp(*op_desc)); } for (const auto& op : ops_vec_) { diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index b655d140d37a5..4764d4fd4af87 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -46,7 +46,7 @@ class TaskNode final { ~TaskNode() = default; void SetProgram(paddle::framework::ProgramDesc* program); - void Init(); + void Init(bool use_feed_fetch_ops = true); int64_t rank() const { return rank_; } int64_t task_id() const { return task_id_; } int32_t role() const { return role_; } diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index fd2ccffae3b4a..9c33d70030645 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -274,6 +274,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + // fleet exe related + CP_MEMBER(dist_config_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, platform::errors::InvalidArgument( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cd6e3a3c759c0..5492c3b0d2645 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/version.h" @@ -47,6 +48,14 @@ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/utils/string/split.h" + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#endif #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -186,14 +195,14 @@ bool AnalysisPredictor::Init( return false; } + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + // Prepare executor, create local variables. 
if (!PrepareExecutor()) { return true; } - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); - return true; } @@ -359,6 +368,13 @@ static void DisablePrepareDataOpt( } bool AnalysisPredictor::PrepareExecutor() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "use_dist_model is enabled, will init FleetExecutor."; + return PrepareFleetExecutor(); + } +#endif DisablePrepareDataOpt(inference_program_, 0, false); executor_->Prepare(sub_scope_, *inference_program_, 0, @@ -371,6 +387,226 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +bool AnalysisPredictor::PrepareFleetExecutor() { + VLOG(3) << "AnalysisPredictor::PrepareFleetExecutor()"; + if (config_.dist_config().nranks() > 1 && !CommInit()) { + return false; + } + task_node_.reset(new distributed::TaskNode(inference_program_.get(), + config_.dist_config().rank())); + // With auto cut, there is no concept of pp, no need to add dependency. + task_node_->SetType("Compute"); + task_node_->Init(config_.use_feed_fetch_ops_enabled()); + executor_desc_ = distributed::FleetExecutorDesc(); + executor_desc_.set_cur_rank(config_.dist_config().rank()); + std::unordered_map id_to_rank; + for (int i = 0; i < config_.dist_config().nranks(); ++i) { + distributed::RankInfo *rank_info = executor_desc_.add_cluster_info(); + rank_info->set_rank(i); + rank_info->set_ip_port(config_.dist_config().trainer_endpoints()[i]); + id_to_rank.insert({i, i}); + } + fleet_exe_.reset(new distributed::FleetExecutor(executor_desc_)); + // NOTE: Vars of feed fetch ops are not persistable, + // which will result in that those vars will be created in + // the subscope (microscope) in fleet executor. This will + // cause that the GetInputTensor/GetOutputTensor funct + // in analysis predictor cannot find those vars in the scope + // returned by the DistModel, since DistModel only return the + // root scope. 
So, those vars must to be created in the root + // scope instead of in the microscope + std::vector feed_fetch_vars; + for (auto pair : idx2feeds_) { + feed_fetch_vars.emplace_back(pair.second); + } + for (auto pair : idx2fetches_) { + feed_fetch_vars.emplace_back(pair.second); + } + fleet_exe_->Init(config_.dist_config().carrier_id(), + *(inference_program_.get()), scope_.get(), place_, 1, + {task_node_.get()}, id_to_rank, feed_fetch_vars); + return true; +} + +bool AnalysisPredictor::CommInit() { + std::map> ring_id_to_ranks{}; + std::map> rank_to_ring_ids{}; + if (!LoadConverterConfig(&ring_id_to_ranks, &rank_to_ring_ids)) { + VLOG(3) << "Load converter config failed, DistModel init failed."; + return false; + } + std::unique_ptr comm_init_program( + new framework::ProgramDesc()); + framework::BlockDesc *comm_init_block = comm_init_program->MutableBlock(0); + std::vector &ring_ids = + rank_to_ring_ids[config_.dist_config().rank()]; + int64_t order = 0; + std::string var_name_base = "comm_init_"; + for (int64_t ring_id : ring_ids) { + VLOG(3) << "Init comm for ring id: " << ring_id; + int64_t ranks_in_group = ring_id_to_ranks[ring_id].size(); + int64_t rank_in_group = 0; + std::vector &ranks = ring_id_to_ranks[ring_id]; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + break; + } + rank_in_group += 1; + } + std::vector peer_endpoints; + for (int64_t rank : ranks) { + if (config_.dist_config().rank() == rank) { + continue; + } + peer_endpoints.emplace_back( + config_.dist_config().trainer_endpoints()[rank]); + } + InsertCommOp(var_name_base + std::to_string(order), ranks_in_group, + rank_in_group, peer_endpoints, comm_init_block, ring_id); + order += 1; + } + framework::NaiveExecutor e(place_); + e.CreateVariables(*comm_init_program, 0, true, scope_.get()); + e.Prepare(scope_.get(), *comm_init_program, 0, false); + e.Run(); + VLOG(3) << "Comm init successful."; + return true; +} + +void AnalysisPredictor::InsertCommOp( + std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, framework::BlockDesc *block, + int ring_id) { + /* + * tmp_var_name: the var name for var comm_id + * nranks: number of total ranks + * rank: the rank of local rank in the comm group + * peer_endpoints: peer's endpoints + * block: the block where to insert the comm ops + * ring_id: the ring_id to be inited + */ + const std::string &endpoint = config_.dist_config().current_endpoint(); + std::stringstream ss; + ss << "Init comm with tmp var: " << tmp_var_name + << ". The ring id is: " << ring_id << ". The group has: " << nranks + << " ranks. Current rank in the group is: " << rank + << ". The endpoint is: " << endpoint << ". 
Peer endpoints are: "; + for (auto ep : peer_endpoints) { + ss << ep << ", "; + } + VLOG(3) << ss.str(); + if (config_.use_gpu()) { + framework::VarDesc *new_var = block->Var(tmp_var_name); + new_var->SetType(framework::proto::VarType::RAW); + new_var->SetPersistable(true); + framework::OpDesc *gen_nccl_id_op = block->AppendOp(); + gen_nccl_id_op->SetType("c_gen_nccl_id"); + gen_nccl_id_op->SetOutput("Out", {tmp_var_name}); + gen_nccl_id_op->SetAttr("rank", rank); + gen_nccl_id_op->SetAttr("endpoint", + config_.dist_config().current_endpoint()); + gen_nccl_id_op->SetAttr("other_endpoints", peer_endpoints); + gen_nccl_id_op->SetAttr("ring_id", ring_id); + gen_nccl_id_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + gen_nccl_id_op->CheckAttrs(); + framework::OpDesc *comm_init_op = block->AppendOp(); + comm_init_op->SetType("c_comm_init"); + comm_init_op->SetInput("X", {tmp_var_name}); + comm_init_op->SetAttr("rank", rank); + comm_init_op->SetAttr("nranks", nranks); + comm_init_op->SetAttr("ring_id", ring_id); + comm_init_op->SetAttr("op_role", + static_cast(framework::OpRole::kForward)); + comm_init_op->CheckAttrs(); + } else { + LOG(WARNING) << "DistModelInf doesn't init comm."; + // TODO(fleet exe dev): comm init for more devices + } +} + +bool AnalysisPredictor::LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids) { + VLOG(3) << "Going to load converter config from: " + << config_.dist_config().comm_init_config() << "\n"; + std::ifstream fin(config_.dist_config().comm_init_config(), std::ios::in); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + config_.dist_config().comm_init_config())); + std::string line; + bool ring_to_rank{true}; + // Reading config from file, the config file should like these format + // [ring_id -> ranks] + // 0,0,1,2,3 + // 1,0,1 + // 2,2,3 + // 21,0,1 + // 22,1,2 + // 23,2,3 + // [rank -> ring_ids] + // 0,0,1,21 + // 1,0,1,21,22 + // 2,0,2,22,23 + // 3,0,2,23 + while (std::getline(fin, line)) { + std::vector one_line = paddle::string::Split(line, ','); + if (one_line.size() == 1) { + // start a new section of the config + if (line == "[ring_id -> ranks]") { + ring_to_rank = true; + } else if (line == "[rank -> ring_ids]") { + ring_to_rank = false; + } + } else { + // parse key - values pairs in one section + int64_t key = std::stoll(one_line[0]); + for (size_t i = 1; i < one_line.size(); ++i) { + int64_t val = std::stoll(one_line[i]); + if (ring_to_rank) { + if (ring_id_to_ranks->find(key) == ring_id_to_ranks->end()) { + ring_id_to_ranks->insert({key, std::vector()}); + } + ring_id_to_ranks->at(key).emplace_back(val); + } else { + if (rank_to_ring_ids->find(key) == rank_to_ring_ids->end()) { + rank_to_ring_ids->insert({key, std::vector()}); + } + rank_to_ring_ids->at(key).emplace_back(val); + } + // NOTE: add more configuration sections here + } + } + } + std::stringstream ss; + ss << "Loaded the following converter config:\n"; + ss << "ring_id_to_ranks:\n"; + for (auto pair : *ring_id_to_ranks) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + ss << "rank_to_ring_ids:\n"; + for (auto pair : *rank_to_ring_ids) { + int64_t key = pair.first; + ss << "\t" << key << "\t->\t"; + for (auto value : pair.second) { + ss << value << "\t"; + } + ss << "\n"; + } + VLOG(3) << ss.str(); + return true; +} +#endif + void 
AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { #ifdef PADDLE_WITH_MKLDNN std::vector> inputs_shape; @@ -946,13 +1182,24 @@ std::vector AnalysisPredictor::GetOutputNames() { std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "The variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -985,13 +1232,24 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( std::unique_ptr AnalysisPredictor::GetOutputTensor( const std::string &name) { + framework::Scope *scope; +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + scope = scope_.get(); + } else { + scope = executor_->scope(); + } +#else + scope = executor_->scope(); +#endif PADDLE_ENFORCE_NOT_NULL( - executor_->scope()->FindVar(name), + scope->FindVar(name), platform::errors::PreconditionNotMet( - "he variable named %s is not found in the scope of the exector.", + "The variable named %s is not found in the scope of the executor.", name)); std::unique_ptr res( - new ZeroCopyTensor(static_cast(executor_->scope()))); + new ZeroCopyTensor(static_cast(scope))); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1023,6 +1281,18 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( } bool AnalysisPredictor::ZeroCopyRun() { +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + if (config_.dist_config().use_dist_model()) { + VLOG(3) << "ZeroCopyRun will use the fleet executor."; + inference::Timer timer; + timer.tic(); + fleet_exe_->Run(config_.dist_config().carrier_id()); + VLOG(3) << "Fleet executor inf runs once use: " + << std::to_string(timer.toc()) << "ms"; + return true; + } +#endif paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) { @@ -1035,7 +1305,6 @@ bool AnalysisPredictor::ZeroCopyRun() { MkldnnPreSet(shape_vector); } #endif - executor_->Run(); if (config_.shape_range_info_collected()) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index a8e56101d37da..8ed183dae0b1b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,10 @@ #include #include #include +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" +#endif #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_compatible_info.h" #include "paddle/fluid/inference/analysis/analyzer.h" @@ -391,6 +395,53 @@ class AnalysisPredictor : public PaddlePredictor { void 
StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet exe related + + /// + /// \brief prepare for fleet executor to run + /// + /// Used in AnalysisPredictor::Init(), + /// + bool PrepareFleetExecutor(); + + /// + /// \brief init NCCL env for multi gpus inference + /// + /// Used in AnalysisPredictor::PrepareFleetExecutor() + /// + bool CommInit(); + + /// + /// \brief read the config to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] ring_id_to_ranks: a ptr to ring_id_to_ranks + /// \param[in] rank_to_ring_ids: a ptr to rank_to_ring_ids + /// + bool LoadConverterConfig( + std::map> *ring_id_to_ranks, + std::map> *rank_to_ring_ids); + + /// + /// \brief add ops and run them with NaiveExecutor to init NCCL env + /// + /// Used in AnalysisPredictor::CommInit() + /// + /// \param[in] tmp_var_name: var name to hold NCCL unique id + /// \param[in] nranks: number of ranks in one comm group + /// \param[in] rank: relative rank of current rank in the comm group + /// \param[in] peer_endpoints: group's peers' endpoints + /// \param[in] block: the block to insert comm ops + /// \param[in] ring_id: the ring id to be used to init NCCL env + /// + void InsertCommOp(std::string tmp_var_name, int nranks, int rank, + const std::vector &peer_endpoints, + framework::BlockDesc *block, int ring_id); +#endif + private: AnalysisConfig config_; Argument argument_; @@ -436,6 +487,14 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; int clone_num_{1}; + +#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ + !defined(PADDLE_WITH_ASCEND_CL) + // fleet executor related + distributed::FleetExecutorDesc executor_desc_; + std::shared_ptr fleet_exe_; + std::shared_ptr task_node_; +#endif }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 180c028c6a610..b4a358394404f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -76,6 +76,54 @@ struct LiteNNAdapterConfig { LiteNNAdapterConfig& Disable(); }; +struct DistConfig { + bool use_dist_model() const { return use_dist_model_; } + void EnableDistModel(bool use_dist_model) { + use_dist_model_ = use_dist_model; + } + + std::vector trainer_endpoints() const { + return trainer_endpoints_; + } + + std::string current_endpoint() const { return current_endpoint_; } + + void SetEndpoints(const std::vector& trainer_endpoints, + const std::string& current_endpoint) { + trainer_endpoints_ = trainer_endpoints; + current_endpoint_ = current_endpoint; + } + + int64_t nranks() const { return nranks_; } + + int64_t rank() const { return rank_; } + + void SetRanks(int64_t nranks, int64_t rank) { + nranks_ = nranks; + rank_ = rank; + } + + std::string comm_init_config() const { return comm_init_config_; } + + void SetCommInitConfig(const std::string& comm_init_config) { + comm_init_config_ = comm_init_config; + } + + void SetCarrierId(const std::string& carrier_id) { carrier_id_ = carrier_id; } + + std::string carrier_id() const { return carrier_id_; } + + protected: + // DistModel Inference related + bool use_dist_model_{false}; // whether use DistModel or not + std::vector trainer_endpoints_{}; // all trainers' endpoints + std::string current_endpoint_{}; // current trainer's endpoint + int64_t nranks_{1}; // total 
ranks (number of trainers) + int64_t rank_{0}; // rank + std::string comm_init_config_{}; // converter config path + std::string carrier_id_{"inference"}; +}; + /// /// \brief configuration manager for AnalysisPredictor. /// \since 1.7.0 @@ -763,6 +811,12 @@ struct PD_INFER_DECL AnalysisConfig { LiteNNAdapterConfig& NNAdapter() { return nnadapter_config_; } + void SetDistConfig(const DistConfig& dist_config) { + dist_config_ = dist_config; + } + + const DistConfig& dist_config() const { return dist_config_; } + protected: // Update the config. void Update(); @@ -902,6 +956,9 @@ struct PD_INFER_DECL AnalysisConfig { mutable bool is_valid_{true}; std::string opt_cache_dir_; friend class paddle_infer::experimental::InternalUtils; + + // fleet exe related + DistConfig dist_config_{}; }; } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 0281fd917658a..8c96499a022f7 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -720,6 +720,12 @@ inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zeroco EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) +if(WITH_DISTRIBUTE AND WITH_PSCORE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + inference_analysis_test(test_analyzer_dist_model SRCS analyzer_dist_model_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${OCR_INSTALL_DIR}/model) +endif() + inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model --infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) diff --git a/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc new file mode 100644 index 0000000000000..7cf6e2adfc688 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_dist_model_tester.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { + +TEST(test_dist_model, dist_model) { + std::cout << "Analysis Predictor DistModel test." 
<< std::endl; + AnalysisConfig config; + config.SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + config.SwitchUseFeedFetchOps(false); + config.EnableUseGpu(100, 0); + DistConfig dist_config; + dist_config.SetRanks(1, 0); + dist_config.EnableDistModel(true); + dist_config.SetEndpoints({""}, ""); + config.SetDistConfig(dist_config); + + auto predictor = paddle_infer::CreatePredictor(config); + int batch_size = 1; + int channels = 1; + int height = 48; + int width = 512; + int nums = batch_size * channels * height * width; + std::cout << "Created predictor." << std::endl; + + float* input = new float[nums]; + for (int i = 0; i < nums; ++i) input[i] = 0; + auto input_names = predictor->GetInputNames(); + + auto input_t = predictor->GetInputHandle(input_names[0]); + input_t->Reshape({batch_size, channels, height, width}); + input_t->CopyFromCpu(input); + std::cout << "Input data." << std::endl; + + predictor->Run(); + std::cout << "Zero Copy Run." << std::endl; + + std::vector out_data; + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + std::cout << "Output data." << std::endl; + delete[] input; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index b29cc10e8f56f..8491d1e224930 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -168,7 +168,7 @@ void BindFleetExecutor(py::module* m) { .def("set_run_at_offset", &TaskNode::SetRunAtOffset) .def("set_type", &TaskNode::SetType) .def("role", &TaskNode::role) - .def("init", &TaskNode::Init) + .def("init", [](TaskNode& self) { self.Init(); }) .def("set_program", &TaskNode::SetProgram); py::class_(*m, "DistModelConfig") diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index eafd5baab7d24..9b5041154c95a 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -658,7 +658,24 @@ void BindAnalysisConfig(py::module *m) { return dynamic_cast(self.pass_builder()); }, py::return_value_policy::reference) - .def("nnadapter", &AnalysisConfig::NNAdapter); + .def("nnadapter", &AnalysisConfig::NNAdapter) + .def("set_dist_config", &AnalysisConfig::SetDistConfig) + .def("dist_config", &AnalysisConfig::dist_config); + + py::class_(*m, "DistConfig") + .def(py::init<>()) + .def("set_carrier_id", &DistConfig::SetCarrierId) + .def("set_comm_init_config", &DistConfig::SetCommInitConfig) + .def("set_endpoints", &DistConfig::SetEndpoints) + .def("set_ranks", &DistConfig::SetRanks) + .def("enable_dist_model", &DistConfig::EnableDistModel) + .def("carrier_id", &DistConfig::carrier_id) + .def("current_endpoint", &DistConfig::current_endpoint) + .def("trainer_endpoints", &DistConfig::trainer_endpoints) + .def("nranks", &DistConfig::nranks) + .def("rank", &DistConfig::rank) + .def("comm_init_config", &DistConfig::comm_init_config) + .def("use_dist_model", &DistConfig::use_dist_model); } void BindLiteNNAdapterConfig(py::module *m) { diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e372727b0f0b6..a7971763f53e1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -2034,8 
+2034,11 @@ def _prepare_fleet_executor_carrier(self, fleet_opt['task_id_to_rank'] = task_id_to_rank place = core.Place() place.set_place(self.place) + # NOTE: the last argument is used to force create some vars in root scope, + # won't be used during train. self._fleet_executor.init(carrier_id, program.desc, scope, place, - num_micro_batches, tasks, task_id_to_rank) + num_micro_batches, tasks, task_id_to_rank, + []) def _run_using_fleet_executor(self, program=None, From bc113e10487115fd91cfc738c4279372eeb7c2a2 Mon Sep 17 00:00:00 2001 From: joeqiao12 <45232181+joeqiao12@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:29:24 +0800 Subject: [PATCH 07/41] add logic kernel for mlu (#39940) --- .../operators/controlflow/compare_op_mlu.cc | 200 ++++++++++++++++++ .../unittests/mlu/test_compare_op_mlu.py | 157 ++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 paddle/fluid/operators/controlflow/compare_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc new file mode 100644 index 0000000000000..9dc287ab76a67 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -0,0 +1,200 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class EqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_EQ, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class NotEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_NE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class LessEqualMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_LE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterThanMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GT, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +template +class GreaterEqualMLUKernel : 
public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_x(*x, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(x->dtype())); + MLUCnnlTensorDesc input_y(*y, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(y->dtype())); + MLUCnnlTensorDesc output(*out, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(out->dtype())); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_GE, input_x.get(), GetBasePtr(x), + input_y.get(), GetBasePtr(y), output.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL( + equal, ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel, + ops::EqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + not_equal, ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel, + ops::NotEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_than, ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel, + ops::LessThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + less_equal, ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel, + ops::LessEqualMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_than, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel, + ops::GreaterThanMLUKernel); + +REGISTER_OP_MLU_KERNEL( + greater_equal, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel, + ops::GreaterEqualMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py new file mode 100644 index 0000000000000..87997acce02a3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_compare_op_mlu.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard + + +def create_test_class(op_type, typename, callback): + class Cls(OpTest): + def setUp(self): + self.set_mlu() + self.place = paddle.MLUPlace(0) + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + out = callback(x, y) + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': out} + self.op_type = op_type + + def set_mlu(self): + self.__class__.use_mlu = True + + def test_output(self): + self.check_output_with_place(place=self.place) + + def test_errors(self): + paddle.enable_static() + with program_guard(Program(), Program()): + a = fluid.layers.data(name='a', shape=[2], dtype='float32') + b = fluid.layers.data(name='b', shape=[2], dtype='float32') + c = fluid.layers.data(name='c', shape=[2], dtype='int16') + d = fluid.create_lod_tensor(np.array([[-1]]), [[1]], self.place) + + op = eval("fluid.layers.%s" % self.op_type) + self.assertRaises(TypeError, op, x=a, y=b, axis=True) + self.assertRaises(TypeError, op, x=a, y=b, force_cpu=1) + self.assertRaises(TypeError, op, x=a, y=b, cond=1) + self.assertRaises(TypeError, op, x=a, y=c) + self.assertRaises(TypeError, op, x=c, y=a) + self.assertRaises(TypeError, op, x=a, y=d) + self.assertRaises(TypeError, op, x=d, y=a) + self.assertRaises(TypeError, op, x=c, y=d) + + def test_dynamic_api(self): + paddle.disable_static() + paddle.set_device('mlu:0') + x = np.random.random(size=(10, 7)).astype(typename) + y = np.random.random(size=(10, 7)).astype(typename) + real_result = callback(x, y) + x = paddle.to_tensor(x, dtype=typename) + y = paddle.to_tensor(y, dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + self.assertEqual((out.numpy() == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_1(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 1, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + input_y = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_2(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = paddle.static.data( + name='x', shape=[1, 2, 3], dtype=typename) + y = paddle.static.data( + name='y', shape=[1, 2, 1, 3], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 6).reshape((1, 2, 3)).astype(typename) + input_y = np.arange(1, 7).reshape((1, 2, 1, 3)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_broadcast_api_3(self): + paddle.enable_static() + with 
program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[5], dtype=typename) + y = paddle.static.data(name='y', shape=[3, 1], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x, y) + exe = paddle.static.Executor(self.place) + input_x = np.arange(0, 5).reshape((5)).astype(typename) + input_y = np.array([5, 3, 2]).reshape((3, 1)).astype(typename) + real_result = callback(input_x, input_y) + res, = exe.run(feed={"x": input_x, + "y": input_y}, + fetch_list=[out]) + self.assertEqual((res == real_result).all(), True) + + @unittest.skipIf(typename == 'float16', "float16 is not supported now") + def test_attr_name(self): + paddle.enable_static() + with program_guard(Program(), Program()): + x = fluid.layers.data(name='x', shape=[4], dtype=typename) + y = fluid.layers.data(name='y', shape=[4], dtype=typename) + op = eval("paddle.%s" % (self.op_type)) + out = op(x=x, y=y, name="name_%s" % (self.op_type)) + self.assertEqual("name_%s" % (self.op_type) in out.name, True) + + cls_name = "{0}_{1}".format(op_type, typename) + Cls.__name__ = cls_name + globals()[cls_name] = Cls + + +for _type_name in {'float16', 'float32', 'int32', 'bool'}: + if _type_name == 'int32' or _type_name == 'bool': + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + continue + create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) + create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) + create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) + create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) + create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b) + +if __name__ == '__main__': + unittest.main() From 0c3f7fbcfe68bfb34b0ed5d9aad6e3a8c0cca43f Mon Sep 17 00:00:00 2001 From: chenjian Date: Wed, 2 Mar 2022 15:30:09 +0800 Subject: [PATCH 08/41] Upgrade new profiler (#39984) * add new profiler components * fix bug * upgrade new profiler * fix operator.cc * fix operator.cc * fix cmakelists.txt * fix bug * fix according to pr * fix bug * fix cmake * fix bug * fix a bug * fix bug * fix bug --- paddle/fluid/framework/operator.cc | 8 +- paddle/fluid/platform/profiler/CMakeLists.txt | 10 +- .../platform/profiler/chrometracing_logger.cc | 320 ++++++++++++++---- .../platform/profiler/chrometracing_logger.h | 11 + .../platform/profiler/cpu_utilization.cc | 47 ++- .../platform/profiler/dump/CMakeLists.txt | 3 - .../profiler/dump/deserialization_reader.cc | 16 +- .../profiler/dump/deserialization_reader.h | 4 +- .../platform/profiler/dump/nodetree.proto | 27 +- .../profiler/dump/serialization_logger.cc | 12 + .../profiler/dump/serialization_logger.h | 5 + .../dump/test_serialization_logger.cc | 28 +- .../fluid/platform/profiler/event_python.cc | 122 +++++++ paddle/fluid/platform/profiler/event_python.h | 26 +- paddle/fluid/platform/profiler/profiler.cc | 35 +- paddle/fluid/platform/profiler/profiler.h | 10 +- .../fluid/platform/profiler/profiler_test.cc | 11 +- paddle/fluid/platform/profiler/trace_event.h | 2 + 18 files changed, 578 insertions(+), 119 deletions(-) mode change 100755 => 100644 paddle/fluid/platform/profiler/dump/serialization_logger.h create mode 100644 paddle/fluid/platform/profiler/event_python.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b12ad552aba6e..b91ee3c2d633d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -264,10 +264,10 @@ void 
OperatorBase::Run(const Scope& scope, const platform::Place& place) { // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( Type(), platform::TracerEventType::Operator, 1); - // auto op_name = platform::OpName(outputs_, Type()); - // platform::RecordEvent op_name_record_event( - // op_name, platform::TracerEventType::Operator, 1, - // platform::EventRole::kUniqueOp); + auto op_name = platform::OpName(outputs_, Type()); + platform::RecordEvent op_name_record_event( + op_name, platform::TracerEventType::Operator, 10, + platform::EventRole::kUniqueOp); RunImpl(scope, place); } diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 5acdfa39569f0..c903a52530ccb 100755 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -2,10 +2,12 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) cc_library(event_node SRCS event_node.cc DEPS enforce) cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) add_subdirectory(dump) +cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) +cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind) +cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) +cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 7b207ea7b2011..4061e2d4d494d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -18,40 +18,17 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/os_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/chrometracing_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { static const char* kSchemaVersion = "1.0.0"; static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.json"; -static uint32_t num_span = 0; - -static int64_t nsToUs(int64_t ns) { return ns / 1000; } - -template -std::string string_format(const std::string& format, Args... args) { - int size_s = std::snprintf(nullptr, 0, format.c_str(), args...) 
+ - 1; // Extra space for '\0' - PADDLE_ENFORCE_GE(size_s, 0, platform::errors::Fatal( - "Error during profiler data formatting.")); - auto size = static_cast(size_s); - auto buf = std::make_unique(size); - std::snprintf(buf.get(), size, format.c_str(), args...); - return std::string(buf.get(), size - 1); // exclude the '\0' -} - -std::string GetStringFormatLocalTime() { - std::time_t rawtime; - std::tm* timeinfo; - char buf[100]; - std::time(&rawtime); - timeinfo = std::localtime(&rawtime); - std::strftime(buf, 100, "%F-%X", timeinfo); - return std::string(buf); -} +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -60,16 +37,19 @@ static std::string DefaultFileName() { } const char* ChromeTracingLogger::categary_name_[] = { - "operator", "dataloader", "profile_step", "cuda_runtime", "kernel", - "memcpy", "memset", "user_defined", "others"}; + "Operator", "Dataloader", "ProfileStep", "CudaRuntime", + "Kernel", "Memcpy", "Memset", "UserDefined", + "OperatorInner", "Forward", "Backward", "Optimization", + "Communication", "PythonOp", "PythonUserDefined"}; void ChromeTracingLogger::OpenFile() { output_file_stream_.open(filename_, std::ofstream::out | std::ofstream::trunc); if (!output_file_stream_) { - VLOG(2) << "Unable to open file for writing profiling data." << std::endl; + LOG(WARNING) << "Unable to open file for writing profiling data." + << std::endl; } else { - VLOG(0) << "writing profiling data to " << filename_ << std::endl; + LOG(INFO) << "writing profiling data to " << filename_ << std::endl; } } @@ -122,21 +102,54 @@ void ChromeTracingLogger::LogHostTraceEventNode( if (!output_file_stream_) { return; } - output_file_stream_ << string_format( - std::string( - R"JSON( + switch (host_node.Type()) { + case TracerEventType::ProfileStep: + case TracerEventType::Forward: + case TracerEventType::Backward: + case TracerEventType::Dataloader: + case TracerEventType::Optimization: + case TracerEventType::PythonOp: + case TracerEventType::PythonUserDefined: + output_file_stream_ << string_format( + std::string( + R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(Python)", "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { - + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), - host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), - nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), - categary_name_[static_cast(host_node.Type())]); + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + default: + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "%s", "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, "dur": %lld, + "ph": "X", "cat": "%s", + "args": { + "start_ns": %lld, + "end_ns": %lld + } + }, + )JSON"), + host_node.Name().c_str(), host_node.ProcessId(), host_node.ThreadId(), + nsToUs(host_node.StartNs()), nsToUs(host_node.Duration()), + categary_name_[static_cast(host_node.Type())], + host_node.StartNs(), host_node.EndNs()); + break; + } + + pid_tid_set_.insert({host_node.ProcessId(), host_node.ThreadId()}); } void ChromeTracingLogger::LogRuntimeTraceEventNode( @@ -148,11 +161,13 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( std::string( R"JSON( { - "name": "%s", "pid": %lld, "tid": %lld, + "name": "%s", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, 
"dur": %lld, "ph": "X", "cat": "%s", "args": { - "correlation id": %d + "correlation id": %d, + "start_ns": %lld, + "end_ns": %lld } }, )JSON"), @@ -160,7 +175,23 @@ void ChromeTracingLogger::LogRuntimeTraceEventNode( runtime_node.ThreadId(), nsToUs(runtime_node.StartNs()), nsToUs(runtime_node.Duration()), categary_name_[static_cast(runtime_node.Type())], - runtime_node.CorrelationId()); + runtime_node.CorrelationId(), runtime_node.StartNs(), + runtime_node.EndNs()); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); + + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": "%lld(C++)", + "ts": %lld, + "ph": "s", "cat": "async" + }, + )JSON"), + runtime_node.CorrelationId(), runtime_node.ProcessId(), + runtime_node.ThreadId(), + nsToUs((runtime_node.StartNs() + runtime_node.EndNs()) >> 1)); + pid_tid_set_.insert({runtime_node.ProcessId(), runtime_node.ThreadId()}); } void ChromeTracingLogger::LogDeviceTraceEventNode( @@ -180,6 +211,36 @@ void ChromeTracingLogger::LogDeviceTraceEventNode( default: break; } + if (nsToUs(device_node.Duration()) == 0) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), nsToUs(device_node.StartNs())); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } else { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "launch", "id": %d, "pid": %lld, "tid": %lld, + "ts": %lld, + "ph": "f", "cat": "async", "bp": "e" + }, + )JSON"), + device_node.CorrelationId(), device_node.DeviceId(), + device_node.StreamId(), + nsToUs((device_node.StartNs() + device_node.EndNs()) >> 1)); + deviceid_streamid_set_.insert( + {device_node.DeviceId(), device_node.StreamId()}); + } } void ChromeTracingLogger::HandleTypeKernel( @@ -188,16 +249,21 @@ void ChromeTracingLogger::HandleTypeKernel( float blocks_per_sm = 0.0; float warps_per_sm = 0.0; float occupancy = 0.0; -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUPTI) constexpr int threads_per_warp = 32; const gpuDeviceProp& device_property = GetDeviceProperties(device_node.DeviceId()); - blocks_per_sm = - (kernel_info.grid_x * kernel_info.grid_y * kernel_info.grid_z) / - device_property.multiProcessorCount; + blocks_per_sm = static_cast(kernel_info.grid_x * kernel_info.grid_y * + kernel_info.grid_z) / + device_property.multiProcessorCount; warps_per_sm = blocks_per_sm * (kernel_info.block_x * kernel_info.block_y * kernel_info.block_z) / threads_per_warp; + occupancy = CalculateEstOccupancy( + device_node.DeviceId(), kernel_info.registers_per_thread, + kernel_info.static_shared_memory, kernel_info.dynamic_shared_memory, + kernel_info.block_x, kernel_info.block_y, kernel_info.block_z, + blocks_per_sm); #endif output_file_stream_ << string_format( @@ -208,15 +274,17 @@ void ChromeTracingLogger::HandleTypeKernel( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "registers per thread": %d, - "shared memory": %f, + "shared memory": %d, "blocks per SM": %f, "warps per SM": %f, "grid": [%d, %d, %d], "block": [%d, %d, %d], - "est. 
achieved occupancy %": %f + "theoretical achieved occupancy %%": %f } }, )JSON"), @@ -224,12 +292,13 @@ void ChromeTracingLogger::HandleTypeKernel( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), kernel_info.registers_per_thread, kernel_info.static_shared_memory + kernel_info.dynamic_shared_memory, blocks_per_sm, warps_per_sm, kernel_info.grid_x, kernel_info.grid_y, kernel_info.grid_z, kernel_info.block_x, kernel_info.block_y, - kernel_info.block_z, occupancy); + kernel_info.block_z, occupancy * 100); } void ChromeTracingLogger::HandleTypeMemcpy( @@ -247,6 +316,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "stream": %d, "correlation id": %d, "bytes": %d, "memory bandwidth (GB/s)": %f } @@ -256,8 +327,8 @@ void ChromeTracingLogger::HandleTypeMemcpy( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.StreamId(), device_node.CorrelationId(), - memcpy_info.num_bytes, memory_bandwidth); + device_node.StartNs(), device_node.EndNs(), device_node.StreamId(), + device_node.CorrelationId(), memcpy_info.num_bytes, memory_bandwidth); } void ChromeTracingLogger::HandleTypeMemset( @@ -271,6 +342,8 @@ void ChromeTracingLogger::HandleTypeMemset( "ts": %lld, "dur": %lld, "ph": "X", "cat": "%s", "args": { + "start_ns": %lld, + "end_ns": %lld, "device": %d, "context": %d, "stream": %d, "correlation id": %d, "bytes": %d, "value": %d @@ -281,7 +354,8 @@ void ChromeTracingLogger::HandleTypeMemset( device_node.StreamId(), nsToUs(device_node.StartNs()), nsToUs(device_node.Duration()), categary_name_[static_cast(device_node.Type())], - device_node.DeviceId(), device_node.ContextId(), device_node.StreamId(), + device_node.StartNs(), device_node.EndNs(), device_node.DeviceId(), + device_node.ContextId(), device_node.StreamId(), device_node.CorrelationId(), memset_info.num_bytes, memset_info.value); } @@ -290,10 +364,10 @@ void ChromeTracingLogger::StartLog() { R"JSON( { "schemaVersion": "%s", - "displayTimeUnit": "us", - "SpanNumber": "%d", + "displayTimeUnit": "ms", + "span_indx": "%d", )JSON"), - kSchemaVersion, num_span); + kSchemaVersion, span_indx++); // add device property information #if defined(PADDLE_WITH_CUDA) output_file_stream_ << std::string(R"JSON( @@ -358,11 +432,143 @@ void ChromeTracingLogger::StartLog() { )JSON"); } -void ChromeTracingLogger::EndLog() { +void ChromeTracingLogger::LogMetaInfo( + const std::unordered_map extra_info) { + RefineDisplayName(extra_info); output_file_stream_ << std::string( R"JSON( {} - ] + ], + )JSON"); + output_file_stream_ << std::string(R"JSON( + "ExtraInfo": {)JSON"); + size_t count = extra_info.size(); + for (const auto& kv : extra_info) { + if (count > 1) { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s", + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } else { + output_file_stream_ << string_format(std::string(R"JSON( + "%s": "%s" + )JSON"), + kv.first.c_str(), kv.second.c_str()); + } + count--; + } + output_file_stream_ << std::string(R"JSON( + })JSON"); +} + +void ChromeTracingLogger::RefineDisplayName( + std::unordered_map 
extra_info) { + for (auto it = pid_tid_set_.begin(); it != pid_tid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "process_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "Process %lld (CPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "name": "thread %lld:%s(Python)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "name": "thread %lld:%s(C++)" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(Python)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": "%lld(C++)", + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).first, (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).second, + extra_info[string_format(std::string("%lld"), (*it).second)].c_str(), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second * 2, (*it).first, (*it).second, (*it).second * 2 + 1); + } + + for (auto it = deviceid_streamid_set_.begin(); + it != deviceid_streamid_set_.end(); ++it) { + output_file_stream_ << string_format( + std::string( + R"JSON( + { + "name": "process_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "Deivce %lld (GPU)" + } + }, + { + "name": "thread_name", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "name": "stream %lld" + } + }, + { + "name": "process_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + { + "name": "thread_sort_index", "pid": %lld, "tid": %lld, + "ph": "M", + "args": { + "sort_index": %lld + } + }, + )JSON"), + (*it).first, (*it).second, (*it).first, (*it).first, (*it).second, + (*it).second, (*it).first, (*it).second, (*it).first + 0x10000000, + (*it).first, (*it).second, (*it).second); + } +} + +void ChromeTracingLogger::EndLog() { + output_file_stream_ << std::string( + R"JSON( } )JSON"); } diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index 06734418609d7..20a924a54cabd 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -13,11 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/platform/profiler/output_logger.h" namespace paddle { namespace platform { +// Dump a NodeTrees into a chrome tracing file. +// A ChromeTracingLogger object can only dump a NodeTrees object, +// creates a file in the constructor and closes the file in the destructor. +// should only call LogNodeTrees and LogMetaInfo in order. 
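+//
+// A minimal usage sketch (illustrative only; it mirrors the call sequence of
+// ProfilerResult::Save with format "json" later in this patch, and the
+// `tree` and `extra_info` names below are placeholders, not members of this
+// header):
+//   ChromeTracingLogger logger("trace.json");  // opens the output file
+//   logger.LogNodeTrees(tree);                 // dump every event node
+//   logger.LogMetaInfo(extra_info);            // append ExtraInfo and
+//                                              // display-name metadata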
class ChromeTracingLogger : public BaseLogger { public: explicit ChromeTracingLogger(const std::string& filename); @@ -28,6 +35,7 @@ class ChromeTracingLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); @@ -36,9 +44,12 @@ class ChromeTracingLogger : public BaseLogger { void HandleTypeMemcpy(const DeviceTraceEventNode&); void StartLog(); void EndLog(); + void RefineDisplayName(std::unordered_map); std::string filename_; std::ofstream output_file_stream_; static const char* categary_name_[]; + std::set> pid_tid_set_; + std::set> deviceid_streamid_set_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc index 672a9a154535a..ce2e49a1ccd39 100644 --- a/paddle/fluid/platform/profiler/cpu_utilization.cc +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -54,19 +54,16 @@ void CpuUtilization::RecordBeginTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_start_.tms_utime, &nice_time_start_, - &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, - &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, &irq_start_, + &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record beginning." + << std::endl; } fclose(stat_file); } @@ -90,19 +87,17 @@ void CpuUtilization::RecordEndTimeInfo() { if (stat_file != nullptr) { char temp_str[200]; uint64_t temp_lu; - while (true) { - int retval = fscanf( - stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 - "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, - temp_str, &system_tms_end_.tms_utime, &nice_time_end_, - &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, - &softirq_end_, &steal_end_, &temp_lu, &temp_lu); - if (std::string(temp_str).find("cpu") != 0) { - break; - } - if (retval != 11) { - return; - } + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + + if (retval != 11) { + LOG(WARNING) + << "Failed to read cpu utilization information at record end." 
+ << std::endl; } fclose(stat_file); } diff --git a/paddle/fluid/platform/profiler/dump/CMakeLists.txt b/paddle/fluid/platform/profiler/dump/CMakeLists.txt index e25333f7a8a73..5045c56afbc63 100644 --- a/paddle/fluid/platform/profiler/dump/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/dump/CMakeLists.txt @@ -1,4 +1 @@ proto_library(nodetreeproto SRCS nodetree.proto) -cc_library(serialization_logger SRCS serialization_logger.cc DEPS nodetreeproto event_node) -cc_library(deserialization_reader SRCS deserialization_reader.cc DEPS nodetreeproto event_node) -cc_test(test_serialization_logger SRCS test_serialization_logger.cc DEPS serialization_logger deserialization_reader event_node) diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index d1049a7dc1908..de3411579d3e9 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" - #include +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -36,11 +36,19 @@ void DeserializationReader::OpenFile() { } } -std::unique_ptr DeserializationReader::Parse() { +std::unique_ptr DeserializationReader::Parse() { if (!node_trees_proto_->ParseFromIstream(&input_file_stream_)) { VLOG(2) << "Unable to load node trees in protobuf." << std::endl; return nullptr; } + // restore extra info + ExtraInfo extrainfo; + for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { + ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); + extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"), + extra_info_map.value().c_str()); + } + // restore NodeTrees std::map thread_event_trees_map; for (int node_tree_index = 0; node_tree_index < node_trees_proto_->thread_trees_size(); @@ -95,7 +103,9 @@ std::unique_ptr DeserializationReader::Parse() { } } // restore NodeTrees object - return std::unique_ptr(new NodeTrees(thread_event_trees_map)); + std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); + return std::unique_ptr( + new ProfilerResult(std::move(tree), extrainfo)); } DeserializationReader::~DeserializationReader() { diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index 1ad2dabf229ad..e6feb4f9489e8 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" -#include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" namespace paddle { namespace platform { @@ -24,7 +24,7 @@ class DeserializationReader { explicit DeserializationReader(const std::string& filename); explicit DeserializationReader(const char* filename); ~DeserializationReader(); - std::unique_ptr Parse(); + std::unique_ptr Parse(); private: void OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 37dac0e597ce2..7016745059d40 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -32,9 +32,21 @@ enum TracerEventTypeProto { Memset = 6; // Used to mark record defined by user UserDefined = 7; - // A flag to denote the number of current types - NumTypes = 8; -} + // Used to mark operator detail, (such as infer shape, compute) + OperatorInner = 8; + // Used to mark model training or testing perspective, forward process + Forward = 9; + // Used to mark model training perspective, backward process + Backward = 10; + // Used to mark model training perspective, optimization process + Optimization = 11; + // Used to mark distributed training perspective + Communication = 12; + // Used to mark python api + PythonOp = 13; + // Used to mark python level userdefined + PythonUserDefined = 14; +}; message KernelEventInfoProto { // The X-dimension block size for the kernel. @@ -175,7 +187,14 @@ message ThreadNodeTreeProto { repeated HostTraceEventNodeProto host_nodes = 2; } +message ExtraInfoMap { + required string key = 1; + required string value = 2; +} + message NodeTreesProto { required string version = 1; - repeated ThreadNodeTreeProto thread_trees = 2; + required uint32 span_indx = 2; + repeated ThreadNodeTreeProto thread_trees = 3; + repeated ExtraInfoMap extra_info = 4; } diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index d9ed84bd438a7..73021f4362af5 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/utils.h" namespace paddle { @@ -20,6 +21,7 @@ namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; static const char* version = "1.0.0"; +static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); @@ -39,6 +41,7 @@ void SerializationLogger::OpenFile() { } node_trees_proto_ = new NodeTreesProto(); node_trees_proto_->set_version(std::string(version)); + node_trees_proto_->set_span_indx(span_indx++); } void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { @@ -240,6 +243,15 @@ void SerializationLogger::HandleTypeMemset( device_trace_event); } +void SerializationLogger::LogMetaInfo( + const std::unordered_map extra_info) { + for (const auto& kv : extra_info) { + ExtraInfoMap* extra_info_map = node_trees_proto_->add_extra_info(); + extra_info_map->set_key(kv.first); + extra_info_map->set_value(kv.second); + } +} + SerializationLogger::SerializationLogger(const std::string& filename) { filename_ = filename.empty() ? DefaultFileName() : filename; OpenFile(); diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h old mode 100755 new mode 100644 index 1295be95d4531..378834cff590d --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -11,6 +11,8 @@ limitations under the License. */ #pragma once +#include + #include "paddle/fluid/platform/profiler/dump/nodetree.pb.h" #include "paddle/fluid/platform/profiler/output_logger.h" @@ -20,6 +22,7 @@ namespace platform { // Dump a NodeTrees into a profobuf file. // A SerializationLogger object can only dump a NodeTrees object, // creates a file in the constructor and closes the file in the destructor. +// Should only call LogNodeTrees and LogMetaInfo. 
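+//
+// A minimal usage sketch (illustrative only; it follows the same pattern as
+// ProfilerResult::Save with format "pb" elsewhere in this patch, and the
+// `tree` and `extra_info` names are placeholders):
+//   SerializationLogger logger("trace.pb");  // opens the protobuf output file
+//   logger.LogNodeTrees(tree);               // serialize every event node
+//   logger.LogMetaInfo(extra_info);          // append ExtraInfo key/value pairs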
class SerializationLogger : public BaseLogger { public: explicit SerializationLogger(const std::string& filename); @@ -30,12 +33,14 @@ class SerializationLogger : public BaseLogger { void LogHostTraceEventNode(const HostTraceEventNode&) override; void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; + void LogMetaInfo(const std::unordered_map); private: void OpenFile(); void HandleTypeKernel(const DeviceTraceEventNode&); void HandleTypeMemset(const DeviceTraceEventNode&); void HandleTypeMemcpy(const DeviceTraceEventNode&); + std::string filename_; std::ofstream output_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 2fe9626ec76df..dee1019da2b52 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" #include "paddle/fluid/platform/profiler/dump/serialization_logger.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" using paddle::platform::SerializationLogger; using paddle::platform::DeserializationReader; @@ -31,6 +32,7 @@ using paddle::platform::TracerEventType; using paddle::platform::KernelEventInfo; using paddle::platform::MemcpyEventInfo; using paddle::platform::MemsetEventInfo; +using paddle::platform::ProfilerResult; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -149,7 +151,8 @@ TEST(SerializationLoggerTest, dump_case1) { TEST(DeserializationReaderTest, restore_case0) { DeserializationReader reader("test_serialization_logger_case0.pb"); - std::unique_ptr tree = reader.Parse(); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); std::map> nodes = tree->Traverse(true); EXPECT_EQ(nodes[10].size(), 4u); @@ -172,3 +175,26 @@ TEST(DeserializationReaderTest, restore_case0) { } } } + +TEST(DeserializationReaderTest, restore_case1) { + DeserializationReader reader("test_serialization_logger_case1.pb"); + auto profiler_result = reader.Parse(); + auto& tree = profiler_result->GetNodeTrees(); + std::map> nodes = + tree->Traverse(true); + EXPECT_EQ(nodes[10].size(), 1u); + EXPECT_EQ(nodes[11].size(), 1u); + std::vector thread1_nodes = nodes[10]; + std::vector thread2_nodes = nodes[11]; + for (auto it = thread1_nodes.begin(); it != thread1_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 3u); + } + } + for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { + if ((*it)->Name() == "root node") { + EXPECT_EQ((*it)->GetChildren().size(), 0u); + EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + } + } +} diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc new file mode 100644 index 0000000000000..1a6f19d2f93af --- /dev/null +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/chrometracing_logger.h" +#include "paddle/fluid/platform/profiler/dump/deserialization_reader.h" +#include "paddle/fluid/platform/profiler/dump/serialization_logger.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +namespace paddle { +namespace platform { + +HostPythonNode::~HostPythonNode() { + // delete all runtime or device nodes and recursive delete children + for (auto it = children_node_ptrs.begin(); it != children_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = runtime_node_ptrs.begin(); it != runtime_node_ptrs.end(); + ++it) { + delete *it; + } + for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { + delete *it; + } +} + +HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { + // Copy and transfer EventNode in NodeTree to PythonNode + if (root == nullptr) { + return nullptr; + } + // copy HostTraceEventNode and its children + HostPythonNode* host_python_node = new HostPythonNode(); + host_python_node->name = root->Name(); + host_python_node->type = root->Type(); + host_python_node->start_ns = root->StartNs(); + host_python_node->end_ns = root->EndNs(); + host_python_node->process_id = root->ProcessId(); + host_python_node->thread_id = root->ThreadId(); + for (auto it = root->GetChildren().begin(); it != root->GetChildren().end(); + ++it) { + host_python_node->children_node_ptrs.push_back(CopyTree(*it)); + } + // copy its CudaRuntimeTraceEventNode + for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin(); + runtimenode != root->GetRuntimeTraceEventNodes().end(); ++runtimenode) { + HostPythonNode* runtime_python_node = new HostPythonNode(); + runtime_python_node->name = (*runtimenode)->Name(); + runtime_python_node->type = (*runtimenode)->Type(); + runtime_python_node->start_ns = (*runtimenode)->StartNs(); + runtime_python_node->end_ns = (*runtimenode)->EndNs(); + runtime_python_node->process_id = (*runtimenode)->ProcessId(); + runtime_python_node->thread_id = (*runtimenode)->ThreadId(); + host_python_node->runtime_node_ptrs.push_back(runtime_python_node); + // copy DeviceTraceEventNode + for (auto devicenode = (*runtimenode)->GetDeviceTraceEventNodes().begin(); + devicenode != (*runtimenode)->GetDeviceTraceEventNodes().end(); + ++devicenode) { + DevicePythonNode* device_python_node = new DevicePythonNode(); + device_python_node->name = (*devicenode)->Name(); + device_python_node->type = (*devicenode)->Type(); + device_python_node->start_ns = (*devicenode)->StartNs(); + device_python_node->end_ns = (*devicenode)->EndNs(); + device_python_node->device_id = (*devicenode)->DeviceId(); + device_python_node->context_id = (*devicenode)->ContextId(); + device_python_node->stream_id = (*devicenode)->StreamId(); + runtime_python_node->device_node_ptrs.push_back(device_python_node); + } + } + return host_python_node; +} + +ProfilerResult::ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info) + : tree_(std::move(tree)), extra_info_(extra_info) { + if (tree_ != nullptr) { + std::map nodetrees = 
tree_->GetNodeTrees(); + for (auto it = nodetrees.begin(); it != nodetrees.end(); ++it) { + thread_event_trees_map_[it->first] = CopyTree(it->second); + } + } +} + +ProfilerResult::~ProfilerResult() { + // delete all root nodes + for (auto it = thread_event_trees_map_.begin(); + it != thread_event_trees_map_.end(); ++it) { + delete it->second; + } +} + +void ProfilerResult::Save(const std::string& file_name, + const std::string format) { + if (format == std::string("json")) { + ChromeTracingLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } else if (format == std::string("pb")) { + SerializationLogger logger(file_name); + tree_->LogMe(&logger); + logger.LogMetaInfo(GetExtraInfo()); + } + return; +} + +std::unique_ptr LoadProfilerResult(std::string filename) { + DeserializationReader reader(filename); + std::unique_ptr result = reader.Parse(); + return result; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index b0d8eaa242716..12ecb9fde32aa 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -15,8 +15,11 @@ limitations under the License. */ #pragma once #include +#include +#include #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/extra_info.h" namespace paddle { namespace platform { @@ -66,18 +69,29 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} - explicit ProfilerResult(NodeTrees* tree); + explicit ProfilerResult(std::unique_ptr tree, + const ExtraInfo& extra_info); ~ProfilerResult(); std::map GetData() { - return thread_event_trees_map; + return thread_event_trees_map_; } - void Save(const std::string& file_name); + std::unordered_map GetExtraInfo() { + return extra_info_.GetExtraInfo(); + } + + void Save(const std::string& file_name, + const std::string format = std::string("json")); + + std::unique_ptr& GetNodeTrees() { return tree_; } private: - std::map thread_event_trees_map; - NodeTrees* tree_; - HostPythonNode* CopyTree(HostTraceEventNode* node); + std::map thread_event_trees_map_; + std::unique_ptr tree_; + ExtraInfo extra_info_; + HostPythonNode* CopyTree(HostTraceEventNode* root); }; +std::unique_ptr LoadProfilerResult(std::string filename); + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 5784d6e671bbb..35dbc96874d3c 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -25,8 +25,10 @@ #endif #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" +#include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" #include "paddle/fluid/platform/profiler/trace_event_collector.h" +#include "paddle/fluid/platform/profiler/utils.h" namespace paddle { namespace platform { @@ -44,10 +46,15 @@ std::unique_ptr Profiler::Create(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options) { options_ = options; - HostTracerOptions host_tracer_options; - host_tracer_options.trace_level = options.trace_level; - tracers_.emplace_back(new HostTracer(host_tracer_options), true); - tracers_.emplace_back(&CudaTracer::GetInstance(), false); + std::bitset<32> trace_switch(options_.trace_switch); + if 
(trace_switch.test(kProfileCPUOptionBit)) { + HostTracerOptions host_tracer_options; + host_tracer_options.trace_level = options_.trace_level; + tracers_.emplace_back(new HostTracer(host_tracer_options), true); + } + if (trace_switch.test(kProfileGPUOptionBit)) { + tracers_.emplace_back(&CudaTracer::GetInstance(), false); + } } Profiler::~Profiler() { alive_.store(false); } @@ -63,9 +70,10 @@ void Profiler::Start() { for (auto& tracer : tracers_) { tracer.Get().StartTracing(); } + cpu_utilization_.RecordBeginTimeInfo(); } -std::unique_ptr Profiler::Stop() { +std::unique_ptr Profiler::Stop() { SynchronizeAllDevice(); TraceEventCollector collector; for (auto& tracer : tracers_) { @@ -75,7 +83,22 @@ std::unique_ptr Profiler::Stop() { std::unique_ptr tree(new NodeTrees(collector.HostEvents(), collector.RuntimeEvents(), collector.DeviceEvents())); - return tree; + cpu_utilization_.RecordEndTimeInfo(); + ExtraInfo extrainfo; + extrainfo.AddExtraInfo(std::string("System Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuUtilization()); + extrainfo.AddExtraInfo(std::string("Process Cpu Utilization"), + std::string("%f"), + cpu_utilization_.GetCpuCurProcessUtilization()); + const std::unordered_map thread_names = + collector.ThreadNames(); + for (const auto& kv : thread_names) { + extrainfo.AddExtraInfo(string_format(std::string("%llu"), kv.first), + kv.second); + } + return std::unique_ptr( + new platform::ProfilerResult(std::move(tree), extrainfo)); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler.h b/paddle/fluid/platform/profiler/profiler.h index 4fc1c6daf96c7..f9a8ece050492 100644 --- a/paddle/fluid/platform/profiler/profiler.h +++ b/paddle/fluid/platform/profiler/profiler.h @@ -15,12 +15,15 @@ #pragma once #include +#include #include #include #include #include #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/profiler/cpu_utilization.h" #include "paddle/fluid/platform/profiler/event_node.h" +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/tracer_base.h" DECLARE_int64(host_trace_level); @@ -28,7 +31,11 @@ DECLARE_int64(host_trace_level); namespace paddle { namespace platform { +static constexpr uint32_t kProfileCPUOptionBit = 0; +static constexpr uint32_t kProfileGPUOptionBit = 1; + struct ProfilerOptions { + uint32_t trace_switch = 0; // bit 0: cpu, bit 1: gpu uint32_t trace_level = FLAGS_host_trace_level; }; @@ -40,7 +47,7 @@ class Profiler { void Start(); - std::unique_ptr Stop(); + std::unique_ptr Stop(); ~Profiler(); @@ -70,6 +77,7 @@ class Profiler { ProfilerOptions options_; uint64_t start_ns_ = UINT64_MAX; std::list tracers_; + CpuUtilization cpu_utilization_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 160c801dc6e3e..32310b9e86228 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -22,6 +22,7 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" @@ -30,8 +31,10 @@ TEST(ProfilerTest, TestHostTracer) { using paddle::platform::Profiler; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; + options.trace_switch = 3; auto profiler = 
Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -42,7 +45,8 @@ TEST(ProfilerTest, TestHostTracer) { RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, 3); } - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::set host_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto evt : pair.second) { @@ -56,8 +60,10 @@ TEST(ProfilerTest, TestHostTracer) { TEST(ProfilerTest, TestCudaTracer) { using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; + options.trace_switch = 3; auto profiler = Profiler::Create(options); EXPECT_TRUE(profiler); profiler->Prepare(); @@ -72,7 +78,8 @@ TEST(ProfilerTest, TestCudaTracer) { hipStreamCreate(&stream); hipStreamSynchronize(stream); #endif - auto nodetree = profiler->Stop(); + auto profiler_result = profiler->Stop(); + auto& nodetree = profiler_result->GetNodeTrees(); std::vector runtime_events; for (const auto pair : nodetree->Traverse(true)) { for (const auto host_node : pair.second) { diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index 61f96218560ec..16ef62fb51555 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -48,6 +48,8 @@ enum class TracerEventType { Communication = 12, // Used to mark python api PythonOp = 13, + // Used to mark python level userdefined + PythonUserDefined = 14, // A flag to denote the number of current types NumTypes }; From 1db188f318ae0b0292984e08afd626898e3170da Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 2 Mar 2022 15:37:29 +0800 Subject: [PATCH 09/41] [IPU] update ipu unittests p0 (#39707) * update ipu UTs part0 * rename UT * sync api changes * update uts for new api * use_ipumodel() as classmethod --- .../tests/unittests/ipu/ernie_training.py | 934 ------------------ .../fluid/tests/unittests/ipu/op_test_ipu.py | 73 +- .../unittests/ipu/test_activation_x_op_ipu.py | 133 +++ .../unittests/ipu/test_arg_max_op_ipu.py | 117 +++ .../tests/unittests/ipu/test_assign_op_ipu.py | 102 ++ .../tests/unittests/ipu/test_avg_shard_ipu.py | 112 ++- .../unittests/ipu/test_batch_norm_op_ipu.py | 108 +- ....py => test_batchs_per_step_simple_ipu.py} | 22 +- .../tests/unittests/ipu/test_cast_op_ipu.py | 111 ++- .../tests/unittests/ipu/test_concat_op_ipu.py | 93 +- .../tests/unittests/ipu/test_conv_op_ipu.py | 127 +-- .../ipu/test_cross_entropy2_op_ipu.py | 128 ++- .../tests/unittests/ipu/test_cumsum_op_ipu.py | 123 +++ 13 files changed, 950 insertions(+), 1233 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/ernie_training.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_batchs_per_step_simple.py => test_batchs_per_step_simple_ipu.py} (79%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py b/python/paddle/fluid/tests/unittests/ipu/ernie_training.py deleted file mode 100644 index ddda666db2c0c..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/ernie_training.py +++ /dev/null 
@@ -1,934 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# refrenece : https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/language_model/ernie - -import os -import copy -import argparse -from contextlib import contextmanager -from functools import partial - -import numpy as np -import paddle -import paddle.static -import paddle.fluid as fluid -import paddle.fluid.layers as layers -import paddle.fluid.compiler as compiler -paddle.enable_static() - -SEED = 2021 -INT_DTYPE = None - -# ernie related block -ernie_config = { - "emb_size": 128, - "emb_mapping_in": False, - "hidden_size": 192, - "num_hidden_layers": 2, - "n_layer_per_block": 2, - "num_attention_heads": 12, - "vocab_size": 300, - "max_position_embeddings": 512, - "sent_type_vocab_size": 4, - "task_type_vocab_size": 16, - "hidden_act": "gelu", - "hidden_dropout_prob": 0.0, - "attention_probs_dropout_prob": 0.0, - "preln": False, - "pre_encoder_cmd": "n", - "preprocess_cmd": "", - "postprocess_cmd": "an", - "epsilon": 1e-12, - "initializer_range": 0.02, - "seq_len": 32 -} - - -def gelu(x): - """Gaussian Error Linear Unit. - - This is a smoother version of the RELU. - Original paper: https://arxiv.org/abs/1606.08415 - Args: - x: float Tensor to perform activation. - - Returns: - `x` with the GELU activation applied. - """ - cdf = 0.5 * (1.0 + fluid.layers.tanh( - (np.sqrt(2.0 / np.pi) * (x + 0.044715 * fluid.layers.pow(x, 3.0))))) - return x * cdf - - -def pre_post_process_layer(prev_out, - out, - process_cmd, - dropout_rate=0., - epsilon=1e-12, - name=''): - """ - Add residual connection, layer normalization and droput to the out tensor - optionally according to the value of process_cmd. - This will be used before or after multi-head attention and position-wise - feed-forward networks. - """ - for cmd in process_cmd: - if cmd == "a": # add residual connection - out = out + prev_out if prev_out else out - elif cmd == "n": # add layer normalization - out = layers.layer_norm( - out, - begin_norm_axis=len(out.shape) - 1, - param_attr=fluid.ParamAttr( - name=name + '_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name=name + '_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=epsilon) - elif cmd == "d": # add dropout - if dropout_rate: - out = layers.dropout( - out, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - return out - - -pre_process_layer = partial(pre_post_process_layer, None) -post_process_layer = pre_post_process_layer - - -def positionwise_feed_forward(x, - d_inner_hid, - d_hid, - dropout_rate, - hidden_act, - param_initializer=None, - name='ffn'): - """ - Position-wise Feed-Forward Networks. - This module consists of two linear transformations with a ReLU activation - in between, which is applied to each position separately and identically. 
- """ - - #assert hidden_act == 'gelu.approximate' - hidden = layers.fc(input=x, - size=d_inner_hid, - num_flatten_dims=2, - act=None, - param_attr=fluid.ParamAttr( - name=name + '_fc_0.w_0', - initializer=param_initializer), - bias_attr=name + '_fc_0.b_0') - hidden = gelu(hidden) - - if dropout_rate: - hidden = layers.dropout( - hidden, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - - out = layers.fc(input=hidden, - size=d_hid, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_fc_1.w_0', initializer=param_initializer), - bias_attr=name + '_fc_1.b_0') - - return out - - -def multi_head_attention(queries, - keys, - values, - attn_bias, - d_key, - d_value, - d_model, - n_head=1, - dropout_rate=0., - cache=None, - param_initializer=None, - name='multi_head_att'): - """ - Multi-Head Attention. Note that attn_bias is added to the logit before - computing softmax activiation to mask certain selected positions so that - they will not considered in attention weights. - """ - keys = queries if keys is None else keys - values = keys if values is None else values - - def __compute_qkv(queries, keys, values, n_head, d_key, d_value): - """ - Add linear projection to queries, keys, and values. - """ - q = layers.fc(input=queries, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_query_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_query_fc.b_0') - k = layers.fc(input=keys, - size=d_key * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_key_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_key_fc.b_0') - v = layers.fc(input=values, - size=d_value * n_head, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_value_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_value_fc.b_0') - - return q, k, v - - def __split_heads(x, n_head): - """ - Reshape the last dimension of inpunt tensor x so that it becomes two - dimensions and then transpose. Specifically, input a tensor with shape - [bs, max_sequence_length, n_head * hidden_dim] then output a tensor - with shape [bs, n_head, max_sequence_length, hidden_dim]. - """ - hidden_size = x.shape[-1] - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. - reshaped = layers.reshape( - x=x, shape=[0, 0, n_head, hidden_size // n_head], inplace=False) - - # permuate the dimensions into: - # [batch_size, n_head, max_sequence_len, hidden_size_per_head] - return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) - - def __combine_heads(x): - """ - Transpose and then reshape the last two dimensions of inpunt tensor x - so that it becomes one dimension, which is reverse to __split_heads. - """ - if len(x.shape) == 3: return x - if len(x.shape) != 4: - raise ValueError("Input(x) should be a 4-D Tensor.") - - trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) - # The value 0 in shape attr means copying the corresponding dimension - # size of the input as the output dimension size. 
- return layers.reshape( - x=trans_x, - shape=[0, 0, trans_x.shape[2] * trans_x.shape[3]], - inplace=False) - - def scaled_dot_product_attention(q, k, v, attn_bias, d_key, dropout_rate): - """ - Scaled Dot-Product Attention - """ - scaled_q = layers.scale(x=q, scale=d_key**-0.5) - product = layers.matmul(x=scaled_q, y=k, transpose_y=True) - - if attn_bias: - product += attn_bias - weights = layers.softmax(product) - if dropout_rate: - weights = layers.dropout( - weights, - dropout_prob=dropout_rate, - dropout_implementation="upscale_in_train", - is_test=False) - out = layers.matmul(weights, v) - return out - - q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) - - if cache is not None: # use cache and concat time steps - # Since the inplace reshape in __split_heads changes the shape of k and - # v, which is the cache input for next time step, reshape the cache - # input from the previous time step first. - k = cache["k"] = layers.concat( - [layers.reshape( - cache["k"], shape=[0, 0, d_model]), k], axis=1) - v = cache["v"] = layers.concat( - [layers.reshape( - cache["v"], shape=[0, 0, d_model]), v], axis=1) - - q = __split_heads(q, n_head) - k = __split_heads(k, n_head) - v = __split_heads(v, n_head) - - ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_key, - dropout_rate) - - out = __combine_heads(ctx_multiheads) - - # Project back to the model size. - proj_out = layers.fc(input=out, - size=d_model, - num_flatten_dims=2, - param_attr=fluid.ParamAttr( - name=name + '_output_fc.w_0', - initializer=param_initializer), - bias_attr=name + '_output_fc.b_0') - - return proj_out - - -def encoder_layer(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd="n", - postprocess_cmd="da", - param_initializer=None, - name='', - epsilon=1e-12): - """The encoder layers that can be stacked to form a deep encoder. - This module consits of a multi-head (self) attention followed by - position-wise feed-forward networks and both the two components companied - with the post_process_layer to add residual connection, layer normalization - and droput. - """ - - attn_output = multi_head_attention( - enc_input, - None, - None, - attn_bias, - d_key, - d_value, - d_model, - n_head, - attention_dropout, - param_initializer=param_initializer, - name=name + '_multi_head_att') - - attn_output = post_process_layer( - enc_input, - attn_output, - 'an', - prepostprocess_dropout, - name=name + '_post_att', - epsilon=epsilon) - - ffd_output = positionwise_feed_forward( - attn_output, - d_inner_hid, - d_model, - relu_dropout, - hidden_act, - param_initializer=param_initializer, - name=name + '_ffn') - - post_output = post_process_layer( - attn_output, - ffd_output, - 'an', - prepostprocess_dropout, - name=name + '_post_ffn', - epsilon=epsilon) - - return post_output - - -def encoder_inner_share(enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=None, - name='', - n_layer_per_block=1): - """ - The encoder_inner_share is composed of n_layer_per_block layers returned by calling - encoder_layer. 
- """ - - for i in range(n_layer_per_block): - enc_output = encoder_layer( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - param_initializer=param_initializer, - name=name + '_layer_' + str(i), - epsilon=epsilon) - - enc_input = enc_output - - return enc_output - - -def encoder(enc_input, - attn_bias, - n_layer, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - n_layer_per_block, - param_initializer=None, - name='', - preln=False): - """ - The encoder is composed of a stack of identical layers returned by calling - encoder_layer . - """ - - for _ in range(n_layer // n_layer_per_block): - attn_bias.stop_gradient = True - attn_bias.persistable = False - enc_output = encoder_inner_share( - enc_input, - attn_bias, - n_head, - d_key, - d_value, - d_model, - d_inner_hid, - prepostprocess_dropout, - attention_dropout, - relu_dropout, - hidden_act, - preprocess_cmd, - postprocess_cmd, - epsilon, - param_initializer=param_initializer, - name=name, - n_layer_per_block=n_layer_per_block) - - enc_input = enc_output - - if preln: - enc_output = post_process_layer( - None, - enc_output, - 'n', - prepostprocess_dropout, - name='post_encoder', - epsilon=epsilon) - - enc_output = pre_process_layer( - enc_output, - preprocess_cmd, - prepostprocess_dropout, - name="post_encoder", - epsilon=epsilon) - - return enc_output - - -class ErnieModel(object): - def __init__(self, src_ids, sent_ids, pos_ids, input_mask, config): - - self._emb_size = config['emb_size'] if config[ - 'emb_mapping_in'] else config['hidden_size'] - self._hidden_size = config['hidden_size'] - self._n_layer = config['num_hidden_layers'] - self._n_head = config['num_attention_heads'] - self._voc_size = config['vocab_size'] - self._max_position_seq_len = config['max_position_embeddings'] - self._sent_types = config['sent_type_vocab_size'] - self._task_types = config['task_type_vocab_size'] - self._hidden_act = config['hidden_act'] - self._prepostprocess_dropout = config['hidden_dropout_prob'] - self._attention_dropout = config['attention_probs_dropout_prob'] - self.config = config - self.preln = config['preln'] if 'preln' in config.keys() else False - self.pre_encoder_cmd = "" if self.preln else self.config[ - 'pre_encoder_cmd'] - - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" - self._task_emb_name = "task_embedding" - self._dtype = "float32" - self._emb_dtype = "float32" - - # Initialize all weigths by truncated normal initializer, and all biases - # will be initialized by constant zero by default. 
- self._param_initializer = fluid.initializer.TruncatedNormal( - scale=config['initializer_range']) - - self.src_ids = src_ids - self.sent_ids = sent_ids - self.pos_ids = pos_ids - self.input_mask = input_mask - ''' - _build_position_ids: range op doesn't support - _build_input_mask: logic_not op doesn't support - ''' - - self._build_model() - - def _build_model(self, emb=None): - with fluid.ipu_shard(ipu_index=0, ipu_stage=0): - # padding id in vocabulary must be set to 0 - self.emb_out = fluid.layers.embedding( - input=self.src_ids, - size=[self._voc_size, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._word_emb_name, - initializer=self._param_initializer), - is_sparse=False) - - self.position_emb_out = fluid.layers.embedding( - input=self.pos_ids, - size=[self._max_position_seq_len, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._pos_emb_name, - initializer=self._param_initializer)) - - self.sent_emb_out = fluid.layers.embedding( - self.sent_ids, - size=[self._sent_types, self._emb_size], - dtype=self._emb_dtype, - param_attr=fluid.ParamAttr( - name=self._sent_emb_name, - initializer=self._param_initializer)) - - sum_emb = self.emb_out + self.position_emb_out + self.sent_emb_out - - sum_emb = pre_process_layer( - sum_emb, - self.config['pre_encoder_cmd'], - self._prepostprocess_dropout, - name='pre_encoder', - epsilon=self.config['epsilon']) - - if self.config['emb_mapping_in']: - sum_emb = fluid.layers.fc( - input=sum_emb, - num_flatten_dims=2, - size=self._hidden_size, - param_attr=fluid.ParamAttr( - name='emb_hidden_mapping', - initializer=self._param_initializer), - bias_attr='emb_hidden_mapping_bias') - - self_attn_mask = fluid.layers.matmul( - x=self.input_mask, y=self.input_mask, transpose_y=True) - - self_attn_mask = fluid.layers.scale( - x=self_attn_mask, - scale=10000.0, - bias=-1.0, - bias_after_scale=False) - - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - n_head_self_attn_mask = fluid.layers.stack( - x=[self_attn_mask] * self._n_head, - axis=1) # [bs, _n_head, seqlen, seq_len] - n_head_self_attn_mask.stop_gradient = True - - self._enc_out = encoder( - enc_input=sum_emb, - attn_bias=n_head_self_attn_mask, - n_layer=self._n_layer, - n_head=self._n_head, - d_key=self._hidden_size // self._n_head, - d_value=self._hidden_size // self._n_head, - d_model=self._hidden_size, - d_inner_hid=self._hidden_size * 4, - prepostprocess_dropout=self._prepostprocess_dropout, - attention_dropout=self._attention_dropout, - relu_dropout=0, - hidden_act=self._hidden_act, - preprocess_cmd=self.config['preprocess_cmd'], - postprocess_cmd=self.config['postprocess_cmd'], - param_initializer=self._param_initializer, - name='encoder', - epsilon=self.config['epsilon'], - n_layer_per_block=self.config['n_layer_per_block'], - preln=self.preln) - - def _build_position_ids(self): - d_shape = fluid.layers.shape(self.src_ids) - d_seqlen = d_shape[1] - d_batch = d_shape[0] - position_ids = fluid.layers.reshape( - fluid.layers.range( - 0, d_seqlen, 1, dtype='int32'), [1, d_seqlen, 1], - inplace=False) - position_ids = fluid.layers.expand(position_ids, [d_batch, 1, 1]) - position_ids = fluid.layers.cast(position_ids, INT_DTYPE) - position_ids.stop_gradient = True - return position_ids - - def _build_input_mask(self): - zero = fluid.layers.fill_constant([1], dtype=INT_DTYPE, value=0) - input_mask = fluid.layers.logical_not( - fluid.layers.equal(self.src_ids, zero)) # assume pad id == 0 - input_mask = fluid.layers.cast(input_mask, 
'float32') - input_mask.stop_gradient = True - return input_mask - - def get_sequence_output(self): - return self._enc_out - - def get_pooled_output(self): - """Get the first feature of each sequence for classification""" - next_sent_feat = fluid.layers.slice( - input=self._enc_out, axes=[1], starts=[0], ends=[1]) - - next_sent_feat = fluid.layers.fc( - input=next_sent_feat, - size=self._hidden_size, - act="tanh", - param_attr=fluid.ParamAttr( - name="pooled_fc.w_0", initializer=self._param_initializer), - bias_attr="pooled_fc.b_0") - return next_sent_feat - - def get_next_sentence_output(self, labels): - next_sent_feat = self.get_pooled_output() - next_sent_fc_out = fluid.layers.fc( - input=next_sent_feat, - num_flatten_dims=1, - size=33, - param_attr=fluid.ParamAttr( - name="next_sent_fc.w_0", initializer=self._param_initializer), - bias_attr="next_sent_fc.b_0") - next_sent_fc_out = fluid.layers.reshape( - next_sent_fc_out, [-1, 33], inplace=False) - #next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=next_sent_fc_out, label=labels, return_softmax=True) - next_sent_softmax = fluid.layers.softmax(next_sent_fc_out) - next_sent_loss = fluid.layers.cross_entropy(next_sent_softmax, labels) - next_sent_acc = fluid.layers.accuracy( - input=next_sent_softmax, label=labels) - mean_next_sent_loss = fluid.layers.mean(next_sent_loss, - "mean_next_sent_loss") - return next_sent_acc, mean_next_sent_loss - - def get_lm_output(self, mask_label, mask_pos): - """Get the loss & accuracy for pretraining""" - mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32') - - # extract the first token feature in each sentence - reshaped_emb_out = fluid.layers.reshape( - x=self._enc_out, shape=[-1, self._hidden_size]) - - # extract masked tokens' feature - mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos) - if self._dtype == "float16": - mask_feat = fluid.layers.cast(x=mask_feat, dtype=self._emb_dtype) - - # transform: fc - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - _hidden_act = 'gelu' - else: - _hidden_act = None - - mask_trans_feat = fluid.layers.fc( - input=mask_feat, - size=self._emb_size, - act=_hidden_act, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_fc.w_0', - initializer=self._param_initializer), - bias_attr=fluid.ParamAttr(name='mask_lm_trans_fc.b_0')) - - if self._hidden_act == 'gelu' or self._hidden_act == 'gelu.precise': - pass - else: - mask_trans_feat = gelu(mask_trans_feat) - - # transform: layer norm - mask_trans_feat = fluid.layers.layer_norm( - mask_trans_feat, - begin_norm_axis=len(mask_trans_feat.shape) - 1, - param_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_scale', - initializer=fluid.initializer.Constant(1.)), - bias_attr=fluid.ParamAttr( - name='mask_lm_trans_layer_norm_bias', - initializer=fluid.initializer.Constant(0.)), - epsilon=self.config['epsilon']) - - mask_lm_out_bias_attr = fluid.ParamAttr( - name="mask_lm_out_fc.b_0", - initializer=fluid.initializer.Constant(value=0.0)) - - fc_out = fluid.layers.fc(input=mask_trans_feat, - size=self._voc_size, - param_attr=fluid.ParamAttr( - name="mask_lm_out_fc.w_0", - initializer=self._param_initializer), - bias_attr=mask_lm_out_bias_attr) - #mask_lm_loss = fluid.layers.softmax_with_cross_entropy( - # logits=fc_out, label=mask_label) - mask_lm_softmax = fluid.layers.softmax(fc_out) - mask_lm_loss = fluid.layers.cross_entropy(mask_lm_softmax, mask_label) - mean_mask_lm_loss = fluid.layers.mean( - mask_lm_loss, name="mean_mask_lm_loss") - - 
return mask_lm_loss, mean_mask_lm_loss - - def get_task_output(self, task, task_labels): - task_fc_out = fluid.layers.fc(input=self.next_sent_feat, - size=task["num_labels"], - param_attr=fluid.ParamAttr( - name=task["task_name"] + "_fc.w_0", - initializer=self._param_initializer), - bias_attr=task["task_name"] + "_fc.b_0") - #task_loss, task_softmax = fluid.layers.softmax_with_cross_entropy( - # logits=task_fc_out, label=task_labels, return_softmax=True) - task_softmax = fluid.layers.softmax(task_fc_out) - task_loss = fluid.layers.cross_entropy(task_softmax, task_labels) - task_acc = fluid.layers.accuracy(input=task_softmax, label=task_labels) - mean_task_loss = fluid.layers.mean(task_loss) - return mean_task_loss, task_acc - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(__doc__) - parser.add_argument( - "--run_on_ipu", type=bool, default=True, help="Run model with IPU") - parser.add_argument( - "--is_training", type=bool, default=True, help="Train of inference") - parser.add_argument( - "--num_ipus", type=int, default=2, help="Number of ipus") - parser.add_argument( - "--enable_pipelining", type=bool, default=False, help="Pipelining") - parser.add_argument( - "--save_model", type=bool, default=False, help="Save model or not") - parser.add_argument( - "--model_path", type=str, default="ernie", help="Save model to where") - parser.add_argument( - "--model_name", type=str, default="ernie", help="Save model name") - parser.add_argument( - "--ipu_run_steps", type=int, default=10, help="Number steps exe.run()") - parser.add_argument( - "--export_ops", type=bool, default=False, help="Export ops to ops.txt") - parser.add_argument( - "--export_ipu_idx", type=bool, default=False, help="Export op-idx pair") - args = parser.parse_args() - - # set random seed - np.random.seed(SEED) - paddle.static.default_startup_program().random_seed = SEED - paddle.static.default_main_program().random_seed = SEED - - # IPU doesn't support int64, so we change here - INT_DTYPE = "int32" if args.run_on_ipu else "int64" - - # paddle input placeholder, batch_size = 1 - micro_bs = 1 - seq_len = ernie_config["seq_len"] - input_shape = [micro_bs, seq_len, 1] - input_fields = { - 'names': [ - 'src_ids', 'sent_ids', 'pos_ids', 'input_mask', 'mask_label', - 'mask_pos' - ], - 'shapes': [ - input_shape, input_shape, input_shape, input_shape, [micro_bs, 1], - [micro_bs, 1] - ], - 'dtypes': - [INT_DTYPE, INT_DTYPE, INT_DTYPE, 'float32', INT_DTYPE, INT_DTYPE], - 'range': [[0, seq_len], [0, 4], [0, seq_len], None, [0, seq_len], - [0, seq_len]], - 'lod_levels': [0, 0, 0, 0, 0, 0], - } - - inputs = [ - fluid.data( - name=input_fields['names'][i], - shape=input_fields['shapes'][i], - dtype=input_fields['dtypes'][i], - lod_level=input_fields['lod_levels'][i]) - for i in range(len(input_fields['names'])) - ] - - # total_samples: assum disable pipelining - batches_per_step = 1 - if args.enable_pipelining: - batches_per_step = \ - ((args.num_ipus+1) if args.is_training else args.num_ipus) - total_samples = args.ipu_run_steps * batches_per_step - - total_steps = args.ipu_run_steps - if not args.run_on_ipu: # run on cpu - total_steps = total_samples // micro_bs - - # synthetic data - np_inputs = [] - for i in range(len(input_fields['names'])): - field_name = input_fields['names'][i] - if field_name == 'input_mask': - src_ids = np_inputs[0] - dtype = input_fields['dtypes'][i] - data = np.where(src_ids > 0, - np.ones_like(src_ids), - np.zeros_like(src_ids)).astype(dtype) - else: - shape = 
copy.copy(input_fields['shapes'][i]) - shape[0] = total_samples - min_val, max_val = input_fields['range'][i] - data = np.random.randint( - min_val, max_val, shape, dtype=input_fields['dtypes'][i]) - np_inputs.append(data) - - # paddle input placeholder - (src_ids, sent_ids, pos_ids, input_mask, mask_label, mask_pos) = inputs - - # ernie model - ernie = ErnieModel(src_ids, sent_ids, pos_ids, input_mask, ernie_config) - fetch_node = ernie.get_sequence_output() - if args.is_training: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - _, mean_mask_lm_loss = ernie.get_lm_output(mask_label, mask_pos) - fetch_node = mean_mask_lm_loss - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(mean_mask_lm_loss) - - # place = paddle.CPUPlace() - if args.run_on_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - executor = paddle.static.Executor(place) - - # feed & fetch list - if args.is_training: - feed_list = input_fields['names'] - else: - feed_list = input_fields['names'][:4] - fetch_list = [fetch_node.name] - - # program - startup_prog = paddle.static.default_startup_program() - executor.run(startup_prog) - - main_prog = paddle.static.default_main_program() - paddle.static.save(main_prog, "model/ernie") - paddle.static.load(main_prog, "model/ernie") - - if args.run_on_ipu: - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=args.num_ipus, - is_training=args.is_training, - enable_manual_shard=args.num_ipus > 1) - ipu_strategy.SetPipeliningConfig( - enable_pipelining=args.enable_pipelining, - batches_per_step=args.num_ipus + 1) - - ipu_compiler = compiler.IPUCompiledProgram( - main_prog, ipu_strategy=ipu_strategy) - program = ipu_compiler.compile(feed_list, fetch_list) - else: - program = main_prog - - # executor run - results = [] - for i in range(total_steps): - start = i * (batches_per_step if args.run_on_ipu else 1) - end = start + (batches_per_step if args.run_on_ipu else 1) - feed_dict = { - src_ids.name: np_inputs[0][start:end], - sent_ids.name: np_inputs[1][start:end], - pos_ids.name: np_inputs[2][start:end], - input_mask.name: np_inputs[3][start:end] - } - if args.is_training: - feed_dict[mask_label.name] = np_inputs[4][start:end] - feed_dict[mask_pos.name] = np_inputs[5][start:end] - - res = executor.run(program, feed=feed_dict, fetch_list=[fetch_node]) - results.append(res) - - paddle.static.save(main_prog, "model/ernie") - - results = np.asarray(results).flatten() - if results.size > 32: - results = results[-32:] - print(results) - - if args.save_model: - full_name = args.model_path + '/' + args.model_name - if args.is_training: - fluid.save(program=main_prog, model_path=full_name) - else: - with fluid.ipu_shard(ipu_index=1, ipu_stage=1): - paddle.static.save_inference_model( - full_name, [src_ids, sent_ids, pos_ids, input_mask], - [fetch_node], executor) - - if args.export_ops: - op_type_list = [] - for op in main_prog.global_block().ops: - op_type_list.append(op.desc.type()) - - with open("ops.txt", "w") as fp: - for op_type in set(op_type_list): - fp.write(op_type + os.linesep) - - if args.export_ipu_idx: - op_ipu_idx_list = [] - for op in main_prog.global_block().ops: - if op._is_backward_op(): - continue - - op_ipu_idx_pair = [op.desc.type()] - if op.desc.has_attr("ipu_index"): - op_ipu_idx_pair.append(op.desc.attr("ipu_index")) - else: - op_ipu_idx_pair.append(-1) # not assign ipu_index - op_ipu_idx_list.append(op_ipu_idx_pair) - op_ipu_idx_list.sort(key=lambda item: item[-1]) - - with open("ops_ipu_idx.txt", "w") 
as fp: - for op_ipu_idx_pair in op_ipu_idx_list: - fp.write(str(op_ipu_idx_pair) + os.linesep) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 0d09f60406001..790388f30ead9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -12,17 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random import unittest - import numpy as np -from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator -from typing import Optional -import paddle.fluid.compiler as compiler - -SEED = 2021 +from enum import Enum -ipu_compiler_ref: Optional[compiler.IPUCompiledProgram] = None +import paddle +import paddle.static map_np_dtype_to_fluid_dtype = { 'bool': "bool", @@ -36,6 +33,19 @@ } +class ExecutionMode(Enum): + CPU_FP32 = 1 + IPU_FP32 = 2 + # enable_fp16 through ipu_strategy.enable_fp16 + IPU_POPART_FP16 = 3 + + def __lt__(self, other): + return self.value < other.value + + def __gt__(self, other): + return self.value > other.value + + def np_dtype_to_fluid_str(dtype: np.dtype) -> str: return map_np_dtype_to_fluid_dtype[dtype.name] @@ -43,14 +53,16 @@ def np_dtype_to_fluid_str(dtype: np.dtype) -> str: class IPUOpTest(unittest.TestCase): @classmethod def setUpClass(cls): + # Get random seeds cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - cls.SEED = SEED + cls.SEED = 2021 np.random.seed(cls.SEED) random.seed(cls.SEED) - cls._use_system_allocator = _set_use_system_allocator(True) + # Enable paddle static graph mode + paddle.enable_static() @classmethod def tearDownClass(cls): @@ -58,14 +70,47 @@ def tearDownClass(cls): np.random.set_state(cls._np_rand_state) random.setstate(cls._py_rand_state) - _set_use_system_allocator(cls._use_system_allocator) - # unittest will to trigger IPUCompiledProgram.__del__ automatically - global ipu_compiler_ref - ipu_compiler_ref is not None and ipu_compiler_ref.clean() + @classmethod + def use_ipumodel(cls): + if 'POPLAR_IPUMODEL' not in os.environ: + return False + else: + flag = os.environ['POPLAR_IPUMODEL'] + if flag.upper() in ['1', "TRUE"]: + return True def set_atol(self): - self.atol = 1e-5 + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 def set_training(self): self.is_training = False self.epoch = 1 + + def check(self, outputs, check_shape=False): + cpu_fp32 = outputs[ExecutionMode.CPU_FP32] + ipu_fp32 = outputs[ExecutionMode.IPU_FP32] + max_diff = np.abs(cpu_fp32 - ipu_fp32).max() + fp32_flag = np.allclose( + cpu_fp32, ipu_fp32, rtol=self.rtol, atol=self.atol) + self.assertTrue(fp32_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(cpu_fp32.shape == ipu_fp32.shape) + + ipu_popart_fp16 = None + if ExecutionMode.IPU_POPART_FP16 in outputs.keys(): + ipu_popart_fp16 = outputs[ExecutionMode.IPU_POPART_FP16] + max_diff = np.abs(ipu_popart_fp16.astype(np.float32) - + cpu_fp32).max() + fp16_flag = np.allclose( + ipu_popart_fp16.astype(np.float32), + cpu_fp32, + rtol=self.rtol_fp16, + atol=self.atol_fp16) + self.assertTrue(fp16_flag, "max diff is %f" % (max_diff)) + + if check_shape: + self.assertTrue(ipu_popart_fp16.shape == cpu_fp32.shape) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py new file mode 100644 index 
0000000000000..138365b650f24 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_activation_x_op_ipu.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.nn.functional as F +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestRelu(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_test_op() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_test_op(self): + self.op = paddle.fluid.layers.relu + self.op_attrs = {} + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = self.op(x, **self.op_attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestTanh(TestRelu): + def set_test_op(self): + self.op = F.tanh + self.op_attrs = {} + + +class TestLog(TestRelu): + def set_test_op(self): + self.op = paddle.fluid.layers.log + self.op_attrs = {} + + +class TestSigmoid(TestRelu): + def set_test_op(self): + self.op = F.sigmoid + self.op_attrs = {} + + +class TestSqrt(TestRelu): + def set_test_op(self): + self.op = paddle.fluid.layers.sqrt 
+ self.op_attrs = {} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py new file mode 100644 index 0000000000000..d14eba98ef5d7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_arg_max_op_ipu.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[10, 1000]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {"axis": -1} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.fluid.layers.argmax(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0].astype(np.int32) + + def test_base(self): + output_dict_fp32 = {} + output_dict_fp16 = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + if mode > ExecutionMode.IPU_FP32: + output_dict_fp16[mode] = self._test_base(mode).flatten() + else: + output_dict_fp32[mode] = self._test_base(mode).flatten() + + 
self.check(output_dict_fp32) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"axis": 0} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py new file mode 100644 index 0000000000000..4f17c90de72ad --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_assign_op_ipu.py @@ -0,0 +1,102 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + assign = paddle.assign(x) + out = paddle.fluid.layers.elementwise_add(assign, assign) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py 
b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py index a23cacf47636b..f34e5b0d8b9dc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_avg_shard_ipu.py @@ -16,13 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,78 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 128, 128]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): - self.attrs = {} - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 2e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 128, 128]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - conv1 = paddle.static.nn.conv2d( + + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( + x, num_filters=3, filter_size=3, bias_attr=False) + x = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) - conv2 = paddle.static.nn.conv2d( - conv1, num_filters=3, filter_size=3, bias_attr=False) - conv3 = paddle.static.nn.conv2d( - conv2, num_filters=3, filter_size=3, bias_attr=False) - conv4 = paddle.static.nn.conv2d( - conv3, num_filters=3, filter_size=3, bias_attr=False) - fetch_list = [conv4.name] + fetch_list = [x.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=self.is_training, - enable_manual_shard=True, - need_avg_shard=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + ipu_strategy.set_options({'need_avg_shard': True}) + if exec_mode == 
ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py index 87f783dbd1c1a..1dab958c1ecbc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batch_norm_op_ipu.py @@ -16,13 +16,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,76 +27,100 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = False self.attrs['data_layout'] = 'NCHW' self.attrs['in_place'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) out = paddle.fluid.layers.batch_norm(conv1, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == 
ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' @@ -108,7 +128,13 @@ def set_attrs(self): class TestCase2(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {} self.attrs['is_test'] = True self.attrs['data_layout'] = 'NCHW' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py similarity index 79% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py rename to python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py index 9b485d7794db2..ef61e651b2ad9 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_batchs_per_step_simple.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_batchs_per_step_simple_ipu.py @@ -17,8 +17,7 @@ import numpy as np import unittest import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static paddle.enable_static() SEED = 2021 @@ -28,7 +27,7 @@ "core is not compiled with IPU") class TestFunc(unittest.TestCase): def _test_func(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = SEED @@ -40,22 +39,20 @@ def _test_func(self, run_ipu=True): c, h, w = 3, 10, 10 np_image = np.random.uniform(size=[1 * bps, c, h, w]).astype(np.float32) - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name='image', shape=[n, c, h, w], dtype='float32') conv2d = paddle.static.nn.conv2d( image, num_filters=3, filter_size=3, bias_attr=False) - # paddle.mean oshape on ipu is [bps], need another mean() - # paddle.mean oshape on cpu is [1] - # out = paddle.mean(conv2d) out = conv2d if run_ipu: place = paddle.IPUPlace() else: 
place = paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) @@ -63,14 +60,9 @@ def _test_func(self, run_ipu=True): feed_list = [image.name] fetch_list = [out.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - num_ipus=2, - is_training=False, - enable_manual_shard=True, - need_avg_shard=True) - ipu_strategy.SetPipeliningConfig( - enable_pipelinin=True, batches_per_step=bps) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=False) + ipu_strategy.set_pipelining_config(batches_per_step=bps) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py index 6e58f80904600..5f0eeaa2f99ab 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cast_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,14 +26,14 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() def set_atol(self): self.atol = 1e-3 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), } @@ -47,23 +41,20 @@ def set_feed(self): def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -82,8 +73,8 @@ def _test_base(self, run_ipu=True): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -103,27 +94,91 @@ def test_base(self): self.assertTrue(res0.shape == res1.shape) -class TestCase1(TestBase): - def set_attrs(self): +class TestCase2(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'float32' + + +class TestCase3(TestBase): 
+ def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase4(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'float32' + + +class TestCase5(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), + } + + def set_op_attrs(self): + self.attrs = {} + self.attrs['dtype'] = 'int32' + + +class TestCase6(TestBase): + def set_atol(self): + self.atol = 1e-10 + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('int32'), + } + + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float16' @unittest.skip('float64 is not supported') class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float64' @unittest.skip('skip float16 to float32') class TestCase3(TestBase): - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float16'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'float32' @@ -133,13 +188,13 @@ class TestCase4(TestBase): def set_atol(self): self.atol = 1 - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.randint( low=1, high=100, size=[1, 3, 3, 3]).astype('int32'), } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['dtype'] = 'int8' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py index 094b19ce99da9..c5a8090283940 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_concat_op_ipu.py @@ -16,14 +16,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,81 +27,95 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 10, 10]) + data2 = np.random.uniform(size=[1, 3, 10, 10]) - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), + self.feed_fp32 = { + 'x': data1.astype(np.float32), + 'y': data2.astype(np.float32) + } + self.feed_fp16 = { + 'x': data1.astype(np.float16), + 'y': data2.astype(np.float16) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + 
self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.concat([x, y], **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 1} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py index f28733de6b1a1..ade54fda86929 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_conv_op_ipu.py @@ -16,13 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,20 +26,30 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + 
self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} self.attrs['num_filters'] = 3 self.attrs['filter_size'] = 3 @@ -54,104 +59,112 @@ def set_attrs(self): self.attrs['groups'] = 1 self.attrs['data_format'] = 'NCHW' - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.conv2d(image, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['num_filters'] = 1 class TestCase2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 3] class TestCase2_1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['filter_size'] = [3, 2] class TestCase3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['stride'] = [2, 3] class TestCase4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() 
self.attrs['dilation'] = [2, 2] class TestCase5(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['groups'] = 3 class TestCase6(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = 2 class TestCase7(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [2, 3] class TestCase8(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['padding'] = [1, 2, 2, 3] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py index 3987c6cd5b386..3a21f0cb0079c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_cross_entropy2_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,44 +26,54 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 7]).astype('float32'), - "label": np.arange(3).reshape([3]).astype(np.int64), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[3, 7]) + label = np.arange(3).reshape([3, 1]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {'soft_label': False, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def np_nll_loss(self): + tmp = -np.log(self.feed_fp32['x']) + label = self.feed_fp32['label'] + indice = [range(label.shape[0]), label.flatten()] + self.np_ref = tmp[indice] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype="float32") - # [warning] Copying (host) tensor input/1 from INT64 to INT32. 
- # Will only warn once - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: label = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], @@ -80,52 +84,78 @@ def _test_base(self, run_ipu=True): shape=self.feed_shape[1], dtype='int64') - out = fluid.layers.cross_entropy( + out = paddle.fluid.layers.cross_entropy( input=x, label=label, **self.attrs) + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + if exec_mode != ExecutionMode.CPU_FP32: + feed['label'] = feed['label'].astype(np.int32) - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue(res0.shape == res1.shape) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + self.np_nll_loss() + + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'soft_label': False, 'ignore_index': 1, } -@unittest.skip("soft_label=True id not supported") class TestCase2(TestBase): - def set_attrs(self): + def set_data_feed(self): + x = np.random.uniform(size=[30, 70]) + label = np.arange(30).reshape([30, 1]) + + self.feed_fp32 = { + "x": x.astype(np.float32), + "label": label.astype(np.int64) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "label": label.astype(np.int32) + } + + +@unittest.skip("soft_label=True is not supported") +class TestCase3(TestBase): + def set_op_attrs(self): self.attrs = {'soft_label': True, } diff --git a/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py new file mode 100644 index 0000000000000..2f1d86daf0057 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_cumsum_op_ipu.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + # popart unsupport fp16 cumsum + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + x = np.random.uniform(size=[1, 128]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="float32") + + out = paddle.fluid.layers.cumsum(x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": False} + + +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": False, "reverse": True} + + +class TestCase3(TestBase): + def set_op_attrs(self): + self.attrs = {"exclusive": True, "reverse": True} + + +if __name__ == "__main__": + unittest.main() From 6af2729e615a8d6b3b4f96964f1c71d20b8f5517 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Wed, 2 Mar 2022 15:45:28 +0800 Subject: [PATCH 10/41] =?UTF-8?q?=E3=80=90phi=E3=80=91migrate=20gather=5Ft?= =?UTF-8?q?ree,reduce=5Fprod=20to=20phi=20(#39844)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move to phi * migrate gather_tree_op into phi * move reduce_prod tp phi * optimize code --- paddle/fluid/operators/gather_tree_op.cc | 4 +- paddle/fluid/operators/gather_tree_op.cu | 84 ------------------- paddle/fluid/operators/gather_tree_op.h | 66 --------------- .../operators/reduce_ops/reduce_prod_op.cc | 10 +-- .../operators/reduce_ops/reduce_prod_op.h | 7 -- 
paddle/phi/kernels/cpu/gather_tree_kernel.cc | 62 ++++++++++++++ paddle/phi/kernels/cpu/reduce_prod_kernel.cc | 44 ++++++++++ paddle/phi/kernels/funcs/reduce_functor.h | 8 ++ .../kernels/gather_tree_kernel.h} | 21 +++-- paddle/phi/kernels/gpu/gather_tree_kernel.cu | 79 +++++++++++++++++ paddle/phi/kernels/gpu/reduce_prod_kernel.cu | 43 ++++++++++ paddle/phi/kernels/reduce_prod_kernel.h | 29 +++++++ paddle/phi/ops/compat/reduce_sig.cc | 6 ++ 13 files changed, 285 insertions(+), 178 deletions(-) delete mode 100644 paddle/fluid/operators/gather_tree_op.cu delete mode 100644 paddle/fluid/operators/gather_tree_op.h create mode 100644 paddle/phi/kernels/cpu/gather_tree_kernel.cc create mode 100644 paddle/phi/kernels/cpu/reduce_prod_kernel.cc rename paddle/{fluid/operators/reduce_ops/reduce_prod_op.cu => phi/kernels/gather_tree_kernel.h} (51%) create mode 100644 paddle/phi/kernels/gpu/gather_tree_kernel.cu create mode 100644 paddle/phi/kernels/gpu/reduce_prod_kernel.cu create mode 100644 paddle/phi/kernels/reduce_prod_kernel.h diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 830134e57e0e7..2868c3697eda1 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_tree_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -73,5 +73,3 @@ selected ids. namespace ops = paddle::operators; REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); -REGISTER_OP_CPU_KERNEL(gather_tree, ops::GatherTreeOpKernel, - ops::GatherTreeOpKernel); diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu deleted file mode 100644 index 829682764a674..0000000000000 --- a/paddle/fluid/operators/gather_tree_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_tree_op.h" - -namespace paddle { -namespace operators { - -template -__global__ void GatherTree(const T *ids_data, const T *parents_data, - T *out_data, const int64_t max_length, - const int64_t batch_size, const int64_t beam_size) { - CUDA_KERNEL_LOOP(i, batch_size * beam_size) { - int batch = i / beam_size; - int beam = i % beam_size; - auto idx = - (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } -} - -template -class GatherTreeOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - auto &ids_dims = ids->dims(); - int64_t max_length = ids_dims[0]; - int64_t batch_size = ids_dims[1]; - int64_t beam_size = ids_dims[2]; - - auto &dev_ctx = ctx.cuda_device_context(); - - const int block = 512; - int max_threads = - std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), - batch_size * beam_size); - const int grid = std::max(max_threads / block, 1); - GatherTree<<>>(ids_data, parents_data, out_data, max_length, - batch_size, beam_size); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(gather_tree, ops::GatherTreeOpCUDAKernel, - ops::GatherTreeOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_tree_op.h b/paddle/fluid/operators/gather_tree_op.h deleted file mode 100644 index e035a30e7954f..0000000000000 --- a/paddle/fluid/operators/gather_tree_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherTreeOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *ids = ctx.Input("Ids"); - auto *parents = ctx.Input("Parents"); - auto *out = ctx.Output("Out"); - - const auto *ids_data = ids->data(); - const auto *parents_data = parents->data(); - auto *out_data = out->mutable_data(ctx.GetPlace()); - - auto &ids_dims = ids->dims(); - auto max_length = ids_dims[0]; - auto batch_size = ids_dims[1]; - auto beam_size = ids_dims[2]; - - PADDLE_ENFORCE_NOT_NULL( - ids_data, platform::errors::InvalidArgument( - "Input(Ids) of gather_tree should not be null.")); - - PADDLE_ENFORCE_NOT_NULL( - parents_data, platform::errors::InvalidArgument( - "Input(Parents) of gather_tree should not be null.")); - - for (int batch = 0; batch < batch_size; batch++) { - for (int beam = 0; beam < beam_size; beam++) { - auto idx = (max_length - 1) * batch_size * beam_size + - batch * beam_size + beam; - out_data[idx] = ids_data[idx]; - auto parent = parents_data[idx]; - for (int step = max_length - 2; step >= 0; step--) { - idx = step * batch_size * beam_size + batch * beam_size; - out_data[idx + beam] = ids_data[idx + parent]; - parent = parents_data[idx + parent]; - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 50df75d9ad3fd..eb745ab9c56c5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -27,15 +27,7 @@ class CPUDeviceContext; } // namespace paddle REGISTER_REDUCE_OP(reduce_prod); -REGISTER_OP_CPU_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); + REGISTER_OP_CPU_KERNEL(reduce_prod_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h index 103e108e4bda1..60dedf8d6ffb0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.h @@ -19,13 +19,6 @@ namespace paddle { namespace operators { -struct ProdFunctor { - template - void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { - y->device(place) = x->prod(dim); - } -}; - struct ProdGradFunctor { template diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc new file mode 100644 index 0000000000000..25fb870d851f6 --- /dev/null +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
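The deleted fluid kernels above and the new phi kernels that follow implement the same beam-search backtracking: starting from the last time step, each beam walks its parent pointers backwards and re-reads the ids along that path. A minimal NumPy transcription of that loop, using the same [max_length, batch_size, beam_size] layout as the C++ code (the function name gather_tree_ref is illustrative, not an API from this patch):

import numpy as np

def gather_tree_ref(ids, parents):
    # ids / parents: integer arrays of shape [max_length, batch_size, beam_size]
    ids = np.asarray(ids)
    parents = np.asarray(parents)
    max_length, batch_size, beam_size = ids.shape
    out = np.empty_like(ids)
    for batch in range(batch_size):
        for beam in range(beam_size):
            # the last step keeps its own id; earlier steps follow the parent chain
            out[max_length - 1, batch, beam] = ids[max_length - 1, batch, beam]
            parent = parents[max_length - 1, batch, beam]
            for step in range(max_length - 2, -1, -1):
                out[step, batch, beam] = ids[step, batch, parent]
                parent = parents[step, batch, parent]
    return out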
+ +#include "paddle/phi/kernels/gather_tree_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + + T *out_data = dev_ctx.template Alloc(out); + + auto &ids_dims = ids.dims(); + auto max_length = ids_dims[0]; + auto batch_size = ids_dims[1]; + auto beam_size = ids_dims[2]; + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + for (int batch = 0; batch < batch_size; batch++) { + for (int beam = 0; beam < beam_size; beam++) { + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, CPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/cpu/reduce_prod_kernel.cc b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc new file mode 100644 index 0000000000000..cf0179124ebdf --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_prod_kernel.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
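The CPU kernel that follows reduces the input by multiplication over the requested dims, mirroring the ProdFunctor (y = x.prod(dim)) that this patch moves into paddle/phi/kernels/funcs/reduce_functor.h. A NumPy sketch of that semantics under the usual keep_dim / reduce_all convention (ref_reduce_prod is an illustrative name, not part of the patch):

import numpy as np

def ref_reduce_prod(x, dims, keep_dim=False, reduce_all=False):
    axes = None if reduce_all else tuple(dims)
    return np.prod(np.asarray(x), axis=axes, keepdims=keep_dim)

# ref_reduce_prod([[1., 2.], [3., 4.]], dims=[1])  -> [2., 12.]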
+ +#include "paddle/phi/kernels/reduce_prod_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + CPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index ce8e095e8ac6c..aebd155ac59cb 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -33,5 +33,13 @@ struct MeanFunctor { } }; +//////// Prod Functor /////// +struct ProdFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->prod(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/phi/kernels/gather_tree_kernel.h similarity index 51% rename from paddle/fluid/operators/reduce_ops/reduce_prod_op.cu rename to paddle/phi/kernels/gather_tree_kernel.h index 2de647df8b182..e5a1a684daef0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/phi/kernels/gather_tree_kernel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#pragma once -REGISTER_OP_CUDA_KERNEL( - reduce_prod, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/gather_tree_kernel.cu b/paddle/phi/kernels/gpu/gather_tree_kernel.cu new file mode 100644 index 0000000000000..a9e73ec37c8ed --- /dev/null +++ b/paddle/phi/kernels/gpu/gather_tree_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gather_tree_kernel.h" + +namespace phi { + +template +__global__ void GatherTree(const T *ids_data, + const T *parents_data, + T *out_data, + const int64_t max_length, + const int64_t batch_size, + const int64_t beam_size) { + CUDA_KERNEL_LOOP(i, batch_size * beam_size) { + int batch = i / beam_size; + int beam = i % beam_size; + auto idx = + (max_length - 1) * batch_size * beam_size + batch * beam_size + beam; + out_data[idx] = ids_data[idx]; + auto parent = parents_data[idx]; + for (int step = max_length - 2; step >= 0; step--) { + idx = step * batch_size * beam_size + batch * beam_size; + out_data[idx + beam] = ids_data[idx + parent]; + parent = parents_data[idx + parent]; + } + } +} + +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + T *out_data = dev_ctx.template Alloc(out); + + PADDLE_ENFORCE_NOT_NULL(ids_data, + phi::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, + phi::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + + auto &ids_dims = ids.dims(); + int64_t max_length = ids_dims[0]; + int64_t batch_size = ids_dims[1]; + int64_t beam_size = ids_dims[2]; + + const int block = 512; + int max_threads = + std::min(static_cast(dev_ctx.GetMaxPhysicalThreadCount()), + batch_size * beam_size); + const int grid = std::max(max_threads / block, 1); + GatherTree<<>>( + ids_data, parents_data, out_data, max_length, batch_size, beam_size); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + gather_tree, GPU, ALL_LAYOUT, phi::GatherTreeKernel, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu new file mode 100644 index 0000000000000..14084d0f4f3c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_prod_kernel.cu @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/reduce_prod_kernel.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(reduce_prod, + GPU, + ALL_LAYOUT, + phi::ReduceProdKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/reduce_prod_kernel.h b/paddle/phi/kernels/reduce_prod_kernel.h new file mode 100644 index 0000000000000..5e92b6c4db14e --- /dev/null +++ b/paddle/phi/kernels/reduce_prod_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ReduceProdKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 6395486ed2b72..92839fb303075 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -51,6 +51,11 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("unregistered", {}, {}, {}); } +KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); @@ -58,3 +63,4 @@ PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping); From c9cd47d96b2cccb34d8dc269a055f5b64346a10e Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Wed, 2 Mar 2022 15:58:57 +0800 Subject: [PATCH 11/41] [Auto Parallel] Adapt Partitioner & DistOp for ERNIE3.0 Inference and cache (#39895) * adapot dist op * add dist_fill_constant_batch_size_like * remvoe print * update compitable * add unitest --- .../auto_parallel/operators/__init__.py | 1 + .../auto_parallel/operators/dist_eltwise.py | 0 .../auto_parallel/operators/dist_embedding.py | 5 +- .../dist_fill_constant_batch_size_like.py | 127 ++++++++++++++++++ .../auto_parallel/operators/dist_matmul.py | 8 +- .../distributed/auto_parallel/partitioner.py | 3 + .../test_auto_parallel_while_op.py | 28 ++++ 7 files changed, 168 insertions(+), 4 deletions(-) mode change 100755 => 100644 python/paddle/distributed/auto_parallel/operators/dist_eltwise.py create mode 100644 python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py diff --git 
a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 9f84df2d89634..db6f909f8ca7d 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -27,3 +27,4 @@ from . import dist_check_finite_and_unscale from . import dist_update_loss_scaling from . import dist_split +from . import dist_fill_constant_batch_size_like diff --git a/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py b/python/paddle/distributed/auto_parallel/operators/dist_eltwise.py old mode 100755 new mode 100644 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 94eb0d2d469f0..32f8e2acef5e1 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -155,7 +155,7 @@ def forward(ctx, *args, **kwargs): kwargs['Out']) Ids_var = main_block.var(kwargs['Ids'][0]) - Weight_var = main_block.var(kwargs['W'][0]) + Weight_var = main_block._var_recursive(kwargs['W'][0]) Out_var = main_block.var(kwargs['Out'][0]) # got dist attribute info @@ -277,7 +277,8 @@ def forward(ctx, *args, **kwargs): # param initialization sync if Weight_var.is_parameter and not op_dist_attr.is_recompute: - assert Weight_var.name not in dist_op_context.already_init_sync_vars + if Weight_var.name in dist_op_context.already_init_sync_vars: + return dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) param_dist_attr = ctx.get_tensor_dist_attr_for_program(param) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py new file mode 100644 index 0000000000000..0c9d9eda02e1b --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -0,0 +1,127 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
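The forward pass of the new distributed op below rewrites the fill_constant_batch_size_like shape attribute so each process only materializes its shard: every dimension whose dims_mapping entry is non-negative is divided by the process-mesh extent along that mapped axis. A quick arithmetic sketch (illustrative only; shard_shape is not an API from this patch, and the 2-way mesh along the mapped axis is an assumption that matches the expected shape checked by the unit test later in this patch):

def shard_shape(shape, dims_mapping, mesh_topology):
    out = list(shape)
    for i, axis in enumerate(dims_mapping):
        if axis >= 0:
            out[i] = out[i] // mesh_topology[axis]
    return out

# shard_shape([-1, 16, 0, 48], [-1, 0, -1, -1], [2])  ->  [-1, 8, 0, 48]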
+# See the License for the specific language governing permissions and +# limitations under the License + +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from ..utils import is_dim_shard +from ..utils import is_dim_replicate +from ..utils import is_valid_list_index +from ..utils import compute_compatible_dim_mapping +from ..utils import compute_compatible_dims_mapping +from ..utils import compute_compatible_and_update_dim_mapping +from ..utils import set_dist_op_desc_original_id +from paddle.fluid import core, unique_name +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.framework import Program, Parameter, Variable, program_guard +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from .dist_default import DistributedDefaultImpl0 + + +class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer): + def __init__(self, op_type): + super(DistributedFillConstantBatchSizeLike, self).__init__(op_type) + + +register_distributed_operator_impl_container( + DistributedFillConstantBatchSizeLike("fill_constant_batch_size_like")) + + +class DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl): + def __init__(self, name): + super(DistributedFillConstantBatchSizeLikeImpl0, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + shape_list = op_desc.attr("shape") + + if len(shape_list) != len(out_dims_mapping): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + in_name = op_desc.input('Input')[0] + in_dims_mapping = op_dist_attr.get_input_dims_mapping(in_name) + + # the dim_mapping of batch dimension should be the same + return out_dims_mapping[0] == in_dims_mapping[0] + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + # only the batch size dimemsion of input and output are relative. 
+ dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [0, 0]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + """ + kwargs: inputname_mapping & outputname_mapping + """ + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + dist_op_context = ctx.dist_op_context + src_op = dist_op_context.cur_src_op + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + main_block = dist_op_context.work_block + op = main_block.ops[-1] + assert op.type == "fill_constant_batch_size_like" + + # modify shape attr according to how output are partitioned + out_name = op.output('Out')[0] + dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + process_mesh_shape = op_dist_attr.process_mesh.topology + shape_list = op.attr("shape") + # modify target shape + for idx, axis in enumerate(dims_mapping): + if axis >= 0: + shape_list[idx] = shape_list[idx] // process_mesh_shape[axis] + + op._set_attr("shape", shape_list) + main_block._sync_with_cpp() + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl( + "fill_constant_batch_size_like", + DistributedFillConstantBatchSizeLikeImpl0("fill_by_shape")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 9eb24a65e608c..058ae1d0a9fd5 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -433,8 +433,8 @@ def _right_operand_parameter_matmul_backward(ctx, *args, **kwargs): def _init_param_sync(Weight_var, dist_op_context, startup_block, ctx, rank_id): - assert Weight_var.name not in dist_op_context.already_init_sync_vars, "{} is in {}.".format( - Weight_var.name, dist_op_context.already_init_sync_vars) + if Weight_var.name in dist_op_context.already_init_sync_vars: + return assert startup_block.has_var(Weight_var.name) dist_op_context.already_init_sync_vars.add(Weight_var.name) param = startup_block.var(Weight_var.name) @@ -819,6 +819,8 @@ def forward(ctx, *args, **kwargs): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, @@ -1323,6 +1325,8 @@ def forward(ctx, *args, **kwargs): out_var_dist_attr) intermediate_var_0 = main_block.create_var( + name=unique_name.generate_with_ignorable_key(".".join( + ["c_allreduce_sum", 'tmp'])), shape=Out_var.shape, dtype=Out_var.dtype, type=Out_var.type, diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 2f88407c093a5..ed5ec85d84f22 100644 --- a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -285,6 +285,9 @@ def _get_dist_shape(var, dist_attr): var_shape = var.shape mapping = dist_attr.dims_mapping mesh = dist_attr.process_mesh.topology + if mapping == []: + return var_shape + assert len(var_shape) == len( mapping ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format( diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py index 1cd8f8f3e7083..07e6a2c4346da 100644 --- 
a/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_auto_parallel_while_op.py @@ -174,6 +174,7 @@ def get_program(): dtype='float32') label = static.data( name="label", shape=[batch_size, sequence_len, 1], dtype='float32') + data_holder = [input, label] # dataloader dataloader = paddle.io.DataLoader.from_generator( @@ -194,6 +195,17 @@ def get_program(): "dims_mapping": [-1, -1, -1] }) + # fill constant bsz like + tmp = paddle.fluid.layers.fill_constant_batch_size_like( + input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0) + auto.shard_tensor( + tmp, + dist_attr={ + "process_mesh": _g_process_mesh, + "dims_mapping": [-1, 0, -1, -1] + }) + + # model mlp_start = MLPLayer( hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -395,6 +407,9 @@ def completion(train_program, start_program, dist_context): op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_idx = 1 + elif op.type == "fill_constant_batch_size_like": + op_dist_attr.impl_type = "fill_constant_batch_size_like" + op_dist_attr.impl_idx = 0 else: op_dist_attr.impl_type = "default" op_dist_attr.impl_idx = 0 @@ -428,6 +443,12 @@ def test_partitioner(self): dist_main_prog, dist_startup_prog = partition( train_program, start_program, dist_context) global_block_ops = dist_main_prog.blocks[0].ops + + fill_op = None + for op in global_block_ops: + if op.type == "fill_constant_batch_size_like": + fill_op = op + global_block_ops = [op.type for op in global_block_ops] sub_block_ops = dist_main_prog.blocks[1].ops sub_block_ops = [op.type for op in sub_block_ops] @@ -435,6 +456,13 @@ def test_partitioner(self): self.assertTrue("c_allreduce_sum" in global_block_ops) self.assertTrue("c_allreduce_sum" in sub_block_ops) + # test fill_constant_batch_size_like + + self.assertTrue(fill_op is not None) + ref_shape = [-1, 8, 0, 48] + shape = fill_op.attr("shape") + self.assertTrue(ref_shape == shape) + if __name__ == "__main__": unittest.main() From 4a4215ffad5efada31dcdae9262a806635b1f226 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:14:31 +0800 Subject: [PATCH 12/41] [bf16] add bf16 kernel: softmax & log_softmax (#39999) * add softmax log_softmax * refine rocm * refine unittest --- paddle/fluid/operators/log_softmax_op.cu | 16 ++-- paddle/fluid/operators/math/softmax.cu | 13 +++ paddle/fluid/operators/math/softmax_impl.h | 91 +++++++++++++++++++ .../platform/device/gpu/rocm/miopen_helper.h | 17 ++++ paddle/phi/common/amp_type_traits.h | 42 +++++++++ paddle/phi/common/bfloat16.h | 18 ++-- paddle/phi/common/float16.h | 12 --- paddle/phi/kernels/gpu/softmax_grad_kernel.cu | 4 +- paddle/phi/kernels/gpu/softmax_kernel.cu | 4 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 12 +++ .../gpudnn/softmax_grad_kernel_gpudnn.cu | 14 ++- .../kernels/gpudnn/softmax_kernel_gpudnn.cu | 14 ++- .../fluid/tests/unittests/test_log_softmax.py | 30 +++++- .../fluid/tests/unittests/test_softmax_op.py | 52 ++++++++++- 14 files changed, 305 insertions(+), 34 deletions(-) create mode 100644 paddle/phi/common/amp_type_traits.h diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 034e67568b34c..8770abdac838f 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -13,9 +13,9 @@ // limitations under the License. 
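The kernel changes that follow register bfloat16 for log_softmax and softmax; the bf16 path accumulates in a float "master" type (the MPTypeTrait introduced by this patch). For orientation, a NumPy sketch of the standard shift-by-max log-softmax that such references typically compute (log_softmax_ref is an illustrative name, not the helper used by the unit tests):

import numpy as np

def log_softmax_ref(x, axis=-1):
    x = np.asarray(x, dtype=np.float64)
    shifted = x - np.max(x, axis=axis, keepdims=True)
    return shifted - np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))

# softmax is np.exp(log_softmax_ref(x)); subtracting the per-row maximum keeps
# the exponentials bounded, which matters more as precision drops to bfloat16's
# 7-bit mantissa.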
#include -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/functors.h" @@ -311,7 +311,7 @@ void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -433,7 +433,7 @@ void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename details::MPTypeTrait::Type; + using MPDType = typename phi::dtype::MPTypeTrait::Type; public: void Compute(const framework::ExecutionContext &context) const override { @@ -468,16 +468,18 @@ class LogSoftmaxGradKernel } }; -} // operators -} // paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, - ops::LogSoftmaxKernel); + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); REGISTER_OP_CUDA_KERNEL( log_softmax_grad, ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, - ops::LogSoftmaxGradKernel); + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index fd879e9e6ffe7..83b124902ebb7 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -120,6 +120,10 @@ template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif // MIOPEN do not support double #ifndef PADDLE_WITH_HIP @@ -131,6 +135,10 @@ template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -139,9 +147,13 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; @@ -149,6 +161,7 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index d51d638e0c19f..9833b4447ec45 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -156,6 +156,65 @@ class SoftmaxEigen { } }; +template +class SoftmaxEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* X, framework::Tensor* Y) { + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + constexpr int 
kAxisDim = 1; + + auto logits = EigenMatrix::From(*X); + auto softmax = EigenMatrix::From(*Y); + + const int batch_size = logits.dimension(kBatchDim); + const int num_classes = logits.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); + Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + softmax.device(*context.eigen_device()) = + (logits - + logits.maximum(along_axis) + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } + + softmax.device(*context.eigen_device()) = softmax.exp(); + softmax.device(*context.eigen_device()) = + (softmax * + softmax.reshape(batch_axis_remain) + .sum(along_axis) + .inverse() + .broadcast(one_axis)); + } +}; + template void SoftmaxFunctor::operator()( const DeviceContext& context, const int axis_dim, @@ -289,6 +348,38 @@ class SoftmaxGradEigen { } }; +template +class SoftmaxGradEigen { + public: + void operator()(const DeviceContext& context, const int axis_dim, + const framework::Tensor* y, const framework::Tensor* y_grad, + framework::Tensor* x_grad) { + auto softmax = EigenMatrix::From(*y); + auto softmax_grad = EigenMatrix::From(*y_grad); + auto logits_grad = EigenMatrix::From(*x_grad); + + constexpr int kBatchDim = 0; + constexpr int kClassDim = 1; + + const int batch_size = softmax.dimension(kBatchDim); + const int num_classes = softmax.dimension(kClassDim); + const int num_remain = num_classes / axis_dim; + + Eigen::DSizes along_class(kClassDim); + Eigen::DSizes batch_by_one(batch_size, 1); + Eigen::DSizes one_by_class(1, num_classes); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes one_axis(1, axis_dim); + + auto dot = (softmax * softmax_grad) + .reshape(batch_axis_remain) + .sum(along_class) + .broadcast(one_axis); + logits_grad.device(*context.eigen_device()) = + (softmax_grad - dot) * softmax; + } +}; + template void SoftmaxGradFunctor::operator()( const DeviceContext& context, const int axis_dim, diff --git a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h index 34b9d57e055d5..1a514d2aca267 100644 --- a/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/miopen_helper.h @@ -140,6 +140,23 @@ class CudnnDataType { } }; +template <> +class CudnnDataType { + public: + static const miopenDataType_t type = miopenBFloat16; + // The scaling param type is float for HALF and FLOAT tensors + using ScalingParamType = const float; + using BatchNormParamType 
= float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; + template <> class CudnnDataType { public: diff --git a/paddle/phi/common/amp_type_traits.h b/paddle/phi/common/amp_type_traits.h new file mode 100644 index 0000000000000..ce3a469f5aedd --- /dev/null +++ b/paddle/phi/common/amp_type_traits.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" + +namespace phi { +namespace dtype { + +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +} // namespace dtype +} // namespace phi diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 3fd8eb1b2684a..cf99bb8f19af0 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -377,31 +377,31 @@ struct numeric_limits { static const bool traps = true; static const bool tinyness_before = false; - static phi::dtype::bfloat16(min)() { + HOSTDEVICE static phi::dtype::bfloat16(min)() { return phi::dtype::raw_uint16_to_bfloat16(0x007f); } - static phi::dtype::bfloat16 lowest() { + HOSTDEVICE static phi::dtype::bfloat16 lowest() { return phi::dtype::raw_uint16_to_bfloat16(0xff7f); } - static phi::dtype::bfloat16(max)() { + HOSTDEVICE static phi::dtype::bfloat16(max)() { return phi::dtype::raw_uint16_to_bfloat16(0x7f7f); } - static phi::dtype::bfloat16 epsilon() { + HOSTDEVICE static phi::dtype::bfloat16 epsilon() { return phi::dtype::raw_uint16_to_bfloat16(0x3400); } - static phi::dtype::bfloat16 round_error() { + HOSTDEVICE static phi::dtype::bfloat16 round_error() { return phi::dtype::bfloat16(0.5); } - static phi::dtype::bfloat16 infinity() { + HOSTDEVICE static phi::dtype::bfloat16 infinity() { return phi::dtype::raw_uint16_to_bfloat16(0x7f80); } - static phi::dtype::bfloat16 quiet_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 quiet_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xffc1); } - static phi::dtype::bfloat16 signaling_NaN() { + HOSTDEVICE static phi::dtype::bfloat16 signaling_NaN() { return phi::dtype::raw_uint16_to_bfloat16(0xff81); } - static phi::dtype::bfloat16 denorm_min() { + HOSTDEVICE static phi::dtype::bfloat16 denorm_min() { return phi::dtype::raw_uint16_to_bfloat16(0x0001); } }; diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 6ed9c88d70510..1cdcdef2c12ee 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -988,18 +988,6 @@ inline std::ostream& operator<<(std::ostream& os, const float16& a) { return os; } -template -class MPTypeTrait { - public: - using Type = T; -}; - -template <> -class MPTypeTrait { - public: - using Type = float; -}; - } // namespace dtype } // namespace phi diff 
--git a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu index aa496d3cd391b..04052e0dfc39a 100644 --- a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax_grad, phi::SoftmaxGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu index 32efb9b776419..03c5714b96784 100644 --- a/paddle/phi/kernels/gpu/softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/phi/kernels/softmax_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/softmax_kernel_impl.h" @@ -25,4 +26,5 @@ PD_REGISTER_KERNEL(softmax, phi::SoftmaxRawKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index 45798b88bb58a..c9c549379bbce 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/axis_utils.h" @@ -47,6 +49,11 @@ class VecT4 { public: using Type = int2; }; +template <> +class VecT4 { + public: + using Type = int2; +}; // Vectorization trait 2 * sizeof(T) template @@ -66,6 +73,11 @@ class VecT2 { public: using Type = int; }; +template <> +class VecT2 { + public: + using Type = int; +}; static inline int log2_ceil(int value) { int log2_value = 0; diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu index 56e5fef6e37e4..45ab645d37367 100644 --- a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu @@ -38,7 +38,18 @@ PD_REGISTER_KERNEL(softmax_grad, ALL_LAYOUT, phi::SoftmaxGradGPUDNNKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax_grad, GPUDNN, @@ -48,3 +59,4 @@ PD_REGISTER_KERNEL(softmax_grad, double, phi::dtype::float16) {} #endif +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu index 427d1729a13a8..7685c7dbb6894 100644 --- a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu +++ b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu @@ -37,7 +37,18 @@ PD_REGISTER_KERNEL(softmax, ALL_LAYOUT, 
phi::SoftmaxRawGPUDNNKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +#else +#if CUDNN_VERSION_MIN(8, 1, 0) +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} #else PD_REGISTER_KERNEL(softmax, GPUDNN, @@ -47,3 +58,4 @@ PD_REGISTER_KERNEL(softmax, double, phi::dtype::float16) {} #endif +#endif diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index d1437ca9c96f1..16f954708d4d4 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -14,8 +14,9 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 import paddle +import paddle.fluid.core as core import paddle.nn.functional as F np.random.seed(10) @@ -74,6 +75,33 @@ def set_attrs(self): self.axis = 1 +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestLogSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.dtype = np.uint16 + self.shape = [2, 3, 4, 5] + self.axis = -1 + + x = np.random.uniform(0.1, 1., self.shape).astype(np.float32) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + self.x_grad = ref_log_softmax_grad(x, self.axis) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], ['Out'], user_defined_grads=[self.x_grad]) + + class TestNNLogSoftmaxAPI(unittest.TestCase): def setUp(self): self.x_shape = [2, 3, 4, 5] diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index a1cbefa40f307..4f1c37a242474 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -296,6 +296,56 @@ def get_x_shape(self): return [2, 3, 4, 5] +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxBF16Op(OpTest): + def setUp(self): + self.op_type = "softmax" + self.use_cudnn = self.init_cudnn() + self.use_mkldnn = False + self.dtype = np.uint16 + self.shape = [10, 10] + self.axis = -1 + + np.random.seed(0) + x = np.random.uniform(0.1, 1, self.shape).astype(np.float32) + out = np.apply_along_axis(stable_softmax, self.axis, x) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(x)) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + self.attrs = { + 'axis': self.axis, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn + } + + def init_cudnn(self): + return False + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( 
+ place, ["X"], + "Out", + numeric_grad_delta=0.05, + check_dygraph=(self.use_mkldnn == False)) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0") +class TestSoftmaxBF16CUDNNOp(TestSoftmaxBF16Op): + def init_cudnn(self): + return True + + class TestSoftmaxAPI(unittest.TestCase): def setUp(self): self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( From 07dad6d6ec415758d520e33960a0c53e50ef2ab5 Mon Sep 17 00:00:00 2001 From: huzhiqiang <912790387@qq.com> Date: Wed, 2 Mar 2022 02:16:04 -0600 Subject: [PATCH 13/41] [Infrt]add phi kernel dialect (#39726) --- .gitignore | 3 + .../pybind/kernel_signature_generator.cc | 26 +- paddle/infrt/dialect/infrt/common_type.h | 18 +- paddle/infrt/dialect/infrt/infrt_ops_base.td | 7 +- paddle/infrt/dialect/init_infrt_dialects.cc | 4 + paddle/infrt/dialect/phi/ir/CMakeLists.txt | 7 +- .../infrt/dialect/phi/ir/infrt_phi_kernel.td | 24 +- .../infrt/dialect/phi/ir/infrt_phi_tensor.td | 11 +- paddle/infrt/dialect/phi/ir/phi_kernels.cc | 44 +++ paddle/infrt/dialect/phi/ir/phi_kernels.h | 42 +++ .../infrt/dialect/phi/pass/kernel_op_desc.cc | 45 ++- paddle/infrt/host_context/mlir_exec.cc | 2 + paddle/infrt/kernel/phi/context_kernels.cc | 8 +- paddle/infrt/kernel/phi/context_kernels.h | 3 +- .../infrt/kernel/phi/dense_tensor_kernels.cc | 34 ++- .../infrt/kernel/phi/dense_tensor_kernels.h | 3 +- .../infershaped/infershape_launchers_test.cc | 2 +- paddle/infrt/kernel/phi/registry.cc | 2 + .../tests/dialect/pten/dense_tensor.mlir | 12 +- paddle/scripts/infrt_build.sh | 4 +- tools/infrt/generate_phi_kernel_dialect.py | 276 ++++++++++++++++++ tools/infrt/get_phi_kernel_info.py | 12 +- 22 files changed, 536 insertions(+), 53 deletions(-) create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.cc create mode 100644 paddle/infrt/dialect/phi/ir/phi_kernels.h create mode 100644 tools/infrt/generate_phi_kernel_dialect.py diff --git a/.gitignore b/.gitignore index cecd6fa91c754..debec551d9cd7 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,9 @@ tools/__pycache__ # This file is automatically generated. # TODO(zhiqiang) Move this file to build directory. 
paddle/infrt/dialect/pd_ops.td +paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td +paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td +tools/infrt/kernels.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 8283a249ded4c..f0d5a4e477fe4 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -49,24 +49,30 @@ int main(int argc, char **argv) { if (kernel_signature_map.Has(op_kernel_pair.first)) { std::cout << "\"" << op_kernel_pair.first << "\":{"; auto &args = kernel_signature_map.Get(op_kernel_pair.first).args; + std::cout << "\"inputs\":["; - for (auto name : std::get<0>(args)) { - std::cout << "\"" << name << "\","; + auto inputs_ = std::get<0>(args); + if (inputs_.size() > 0) std::cout << inputs_[0]; + for (size_t i = 1; i < inputs_.size(); i++) { + std::cout << ",\"" << inputs_[i] << "\""; } - if (std::get<0>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"attrs\":["; - for (auto name : std::get<1>(args)) { - std::cout << "\"" << name << "\","; + auto attrs_ = std::get<1>(args); + if (attrs_.size() > 0) std::cout << attrs_[0]; + for (size_t i = 1; i < attrs_.size(); i++) { + std::cout << ",\"" << attrs_[i] << "\""; } - if (std::get<1>(args).size() > 0) std::cout << "\b"; + std::cout << "],\"outputs\":["; - for (auto name : std::get<2>(args)) { - std::cout << "\"" << name << "\","; + auto outputs_ = std::get<2>(args); + for (size_t i = 1; i < outputs_.size(); i++) { + std::cout << ",\"" << outputs_[i] << "\""; } - if (std::get<2>(args).size() > 0) std::cout << "\b"; + std::cout << "]},"; } } - std::cout << "\b}" << std::endl; + std::cout << "}" << std::endl; return 0; } diff --git a/paddle/infrt/dialect/infrt/common_type.h b/paddle/infrt/dialect/infrt/common_type.h index d6d6503c03be5..436e7920ca5c6 100644 --- a/paddle/infrt/dialect/infrt/common_type.h +++ b/paddle/infrt/dialect/infrt/common_type.h @@ -21,8 +21,22 @@ namespace infrt { enum class TargetType : uint8_t { CPU, GPU, UNK }; -enum class PrecisionType : uint8_t { FLOAT32, FLOAT16, UNK }; -enum class LayoutType : uint8_t { NCHW, NHWC, UNK }; +enum class LayoutType : uint8_t { NCHW, NHWC, ANY, UNK }; +enum class PrecisionType : uint8_t { + UINT8, + INT8, + INT16, + INT32, + INT64, + FLOAT16, + BFLOAT16, + FLOAT32, + FLOAT64, + COMPLEX64, + COMPLEX128, + BOOL, + UNK +}; struct Place { TargetType target; diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 978b126d75416..f19912dc0cd59 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -34,9 +34,10 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { let summary = "infrt dense tensor"; let description = [{dense_tensor<, 3>}]; let parameters = (ins - "TargetType":$target, - "PrecisionType":$precision, - "LayoutType":$layout + "::infrt::TargetType":$target, + "::infrt::PrecisionType":$precision, + "::infrt::LayoutType":$layout + ); } diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index c5c81b4b0f22d..5eae01719361d 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -23,6 +23,8 @@ #include "paddle/infrt/dialect/pd_ops.h" #include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include "paddle/infrt/dialect/phi/ir/phi_base.h" 
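+// phi_kernels.h pulls in the TableGen-generated phi_cpu / phi_gpu kernel
+// dialect declarations so they can be registered together with the existing
+// PHI dialects below.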
+#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" + #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { @@ -34,6 +36,8 @@ void registerCinnDialects(mlir::DialectRegistry ®istry) { // NOLINT mlir::pd::PaddleDialect, #ifdef INFRT_WITH_PHI phi::PHIDenseTensorDialect, + phi::PHICPUKernelDialect, + phi::PHIGPUKernelDialect, phi::PHIDialect #endif >(); diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt index 8c1d75629d09c..0497b9832118f 100644 --- a/paddle/infrt/dialect/phi/ir/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -1,9 +1,12 @@ #mlir_tablegen_on(infrt_phi_base DIALECT phi) add_mlir_dialect(infrt_phi_base phi) add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) +add_mlir_dialect(phi_cpu_kernels phi_cpu) +add_mlir_dialect(phi_gpu_kernels phi_gpu) + #mlir_tablegen_on(infrt_phi_tensor) gather_srcs(infrt_src SRCS phi_base.cc - infrt_phi_tensor.cc) + infrt_phi_tensor.cc + phi_kernels.cc) diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 37bf0b5ef213d..ee23470fc754a 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -6,24 +6,32 @@ include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" -def PHI_KernelDialect : Dialect { - let name = "phi_kernel"; +def PHI_CPUKernelDialect : Dialect { + let name = "phi_cpu"; let description = [{ - The PHI Kernel dialect. + The PHI CPU Kernel dialect. + }]; + + let cppNamespace = "::infrt::phi"; +} + +def PHI_GPUKernelDialect : Dialect { + let name = "phi_gpu"; + + let description = [{ + The PHI GPU Kernel dialect. }]; let cppNamespace = "::infrt::phi"; } // PHI Kernel related ops. -class PDT_Kernel traits = []> : Op { +class PDTCPU_Kernel traits = []> : Op { } -def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x); - let results = (outs DenseTensor:$output); +// PHI Kernel related ops. +class PDTGPU_Kernel traits = []> : Op { } #endif - diff --git a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index dc3a4b340d767..39677871ff8fe 100644 --- a/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -34,6 +34,14 @@ class FillDenseTensorOp : attr_type:$value ); let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; +} + +class PrintDenseTensorOp: + PDT_Op<"print_tensor"> { + let arguments = (ins DenseTensor:$input); + let results = (outs); + let assemblyFormat = "`(` $input `:` type($input) `)` attr-dict"; } class CreateCPUAllocatorOp @@ -44,7 +52,7 @@ class CreateCPUAllocatorOp class CreateCPUContextOp : PDT_Op<"create_context." 
# "cpu", [NoSideEffect]> { - let arguments = (ins); + let arguments = (ins CPU_Allocator:$input); let results = (outs CPU_Context:$output); } @@ -52,6 +60,7 @@ def PDT_CreateDenseTensorOp_cpu_f32_nchw : CreateDenseTensorOp<"cpu", "f32", "nc def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; def PDT_CreateContextOp_cpu : CreateCPUContextOp; +def PDT_PrintDenseTensor_cpu : PrintDenseTensorOp; def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.cc b/paddle/infrt/dialect/phi/ir/phi_kernels.cc new file mode 100644 index 0000000000000..c7a837b83fc24 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/phi/ir/phi_kernels.h" +#include + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.cpp.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + +namespace infrt { +namespace phi { + +void PHICPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.cpp.inc" // NOLINT + >(); +} + +void PHIGPUKernelDialect::initialize() { +#define GET_OP_LIST + addOperations< +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.cpp.inc" // NOLINT + >(); +} + +} // namespace phi +} // namespace infrt diff --git a/paddle/infrt/dialect/phi/ir/phi_kernels.h b/paddle/infrt/dialect/phi/ir/phi_kernels.h new file mode 100644 index 0000000000000..b84d1b2b7294b --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/phi_kernels.h @@ -0,0 +1,42 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
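+//
+// Declarations for the phi_cpu and phi_gpu kernel dialects. The op definitions
+// themselves live in phi_cpu_kernels.td / phi_gpu_kernels.td, which are
+// generated by tools/infrt/generate_phi_kernel_dialect.py from kernels.json.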
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/infrt/dialect/dense_tensor.h" +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" + +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_cpu_kernels.h.inc" + +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernelsDialect.h.inc" +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/phi_gpu_kernels.h.inc" diff --git a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc index 63869b7d7b9ea..6c0f6df892100 100644 --- a/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc +++ b/paddle/infrt/dialect/phi/pass/kernel_op_desc.cc @@ -41,26 +41,49 @@ TargetType cvtTargetFromPhi(phi::Backend backend) { } phi::DataType cvtPrecision2Phi(PrecisionType precision) { +#define CONVERT_PRECISION_TO_PHI(Precision) \ + case PrecisionType::Precision: \ + return phi::DataType::Precision; + switch (precision) { - case PrecisionType::FLOAT32: - return phi::DataType::FLOAT32; - break; - case PrecisionType::FLOAT16: - return phi::DataType::FLOAT16; + CONVERT_PRECISION_TO_PHI(FLOAT32) + CONVERT_PRECISION_TO_PHI(FLOAT16) + CONVERT_PRECISION_TO_PHI(FLOAT64) + CONVERT_PRECISION_TO_PHI(UINT8) + CONVERT_PRECISION_TO_PHI(INT8) + CONVERT_PRECISION_TO_PHI(INT16) + CONVERT_PRECISION_TO_PHI(INT32) + CONVERT_PRECISION_TO_PHI(INT64) + CONVERT_PRECISION_TO_PHI(COMPLEX64) + CONVERT_PRECISION_TO_PHI(COMPLEX128) + CONVERT_PRECISION_TO_PHI(BOOL) default: return phi::DataType::UNDEFINED; } +#undef CONVERT_PRECISION_TO_PHI } PrecisionType cvtPrecisionFromPhi(phi::DataType datatype) { +#define CONVERT_PRECISION_FROM_PHI(Precision) \ + case phi::DataType::Precision: \ + return PrecisionType::Precision; + switch (datatype) { - case phi::DataType::FLOAT32: - return PrecisionType::FLOAT32; - case phi::DataType::FLOAT16: - return PrecisionType::FLOAT16; + CONVERT_PRECISION_FROM_PHI(FLOAT32) + CONVERT_PRECISION_FROM_PHI(FLOAT16) + CONVERT_PRECISION_FROM_PHI(FLOAT64) + CONVERT_PRECISION_FROM_PHI(UINT8) + CONVERT_PRECISION_FROM_PHI(INT8) + CONVERT_PRECISION_FROM_PHI(INT16) + CONVERT_PRECISION_FROM_PHI(INT32) + CONVERT_PRECISION_FROM_PHI(INT64) + CONVERT_PRECISION_FROM_PHI(COMPLEX64) + CONVERT_PRECISION_FROM_PHI(COMPLEX128) + CONVERT_PRECISION_FROM_PHI(BOOL) default: return PrecisionType::UNK; } +#undef CONVERT_PRECISION_FROM_PHI } phi::DataLayout cvtLayout2Phi(LayoutType layout) { @@ -69,6 +92,8 @@ phi::DataLayout cvtLayout2Phi(LayoutType layout) { return phi::DataLayout::NCHW; case LayoutType::NHWC: return phi::DataLayout::NHWC; + case LayoutType::ANY: + return phi::DataLayout::ANY; default: return phi::DataLayout::UNDEFINED; } @@ -80,6 +105,8 @@ LayoutType cvtLayoutFromPhi(phi::DataLayout layout) { return LayoutType::NCHW; case phi::DataLayout::NHWC: return LayoutType::NHWC; + case phi::DataLayout::ANY: + return LayoutType::ANY; default: return LayoutType::UNK; } diff --git a/paddle/infrt/host_context/mlir_exec.cc b/paddle/infrt/host_context/mlir_exec.cc index 79717ba2cc034..7823681079f67 100644 --- a/paddle/infrt/host_context/mlir_exec.cc +++ b/paddle/infrt/host_context/mlir_exec.cc @@ -29,6 +29,7 @@ #include "paddle/infrt/kernel/tensor_shape_kernels.h" #include "paddle/infrt/kernel/test_kernels.h" #ifdef INFRT_WITH_PHI +#include 
"paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.h" #include "paddle/infrt/kernel/phi/registry.h" #endif @@ -58,6 +59,7 @@ int main(int argc, char** argv) { kernel::RegisterControlFlowKernels(®istry); #ifdef INFRT_WITH_PHI kernel::RegisterPhiKernels(®istry); + kernel::RegisterInferShapeLaunchers(®istry); #endif // load extra shared library diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 5284f499916c3..3caaf1788e3f8 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -18,7 +18,13 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext() { return {}; } +::phi::CPUContext CreateCpuContext( + infrt::backends::CpuPhiAllocator* allocator) { + ::phi::CPUContext context; + context.SetAllocator(allocator); + context.Init(); + return context; +} } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 8082dc6c2ff29..7f1e7ef6cd356 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/infrt/backends/host/phi_allocator.h" #include "paddle/infrt/backends/host/phi_context.h" #include "paddle/phi/core/dense_tensor.h" @@ -21,7 +22,7 @@ namespace infrt { namespace kernel { namespace phi { -::phi::CPUContext CreateCpuContext(); +::phi::CPUContext CreateCpuContext(::infrt::backends::CpuPhiAllocator*); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc index ce9200b9918c0..871336e8762e8 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.cc +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/infrt/kernel/phi/dense_tensor_kernels.h" - +#include namespace infrt { namespace kernel { namespace phi { @@ -30,8 +30,38 @@ ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( } void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values) {} + host_context::Attribute> values) { + auto place = ::phi::CPUPlace(); + float* a_data = dense_tensor->mutable_data(place); + for (int64_t i = 0; i < dense_tensor->numel(); ++i) { + a_data[i] = (values.get())[i]; + } +} +void PrintDenseTensor(::phi::DenseTensor* dense_tensor) { +#define PRINT_META_DATA(PHI_DATATYPE, DTYPE) \ + case ::phi::DataType::PHI_DATATYPE: { \ + DTYPE* data = dense_tensor->data(); \ + if (dense_tensor->numel() == 0) break; \ + std::cout << data[0]; \ + for (int64_t i = 1; i < dense_tensor->numel(); i++) { \ + std::cout << "," << data[i]; \ + } \ + break; \ + } + + ::phi::DDim dims = dense_tensor->dims(); + std::cout << "dense_tensor: shape=shape" << dims.to_str() << "," + << " values=["; + switch (dense_tensor->dtype()) { + PRINT_META_DATA(FLOAT32, float); + PRINT_META_DATA(INT32, int32_t); + default: + std::cout << "Error! 
Unsupported data type!\n"; + } + std::cout << "]\n"; +#undef PRINT_META_DATA +} } // namespace phi } // namespace kernel } // namespace infrt diff --git a/paddle/infrt/kernel/phi/dense_tensor_kernels.h b/paddle/infrt/kernel/phi/dense_tensor_kernels.h index 25daf7027e8cb..920c0b1c8af42 100644 --- a/paddle/infrt/kernel/phi/dense_tensor_kernels.h +++ b/paddle/infrt/kernel/phi/dense_tensor_kernels.h @@ -28,7 +28,8 @@ ::phi::DenseTensor CreateDenseTensorCpuF32Nchw( host_context::Attribute> lod); void FillDenseTensorF32(::phi::DenseTensor* dense_tensor, - host_context::Attribute> values); + host_context::Attribute> values); +void PrintDenseTensor(::phi::DenseTensor* dense_tensor); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 2161e98fac833..37f9197edb728 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("pten.add.cpu.any.fp32"); + auto creator = registry.GetKernel("phi_cpu.add.any.float32"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index 5d79814d4bec7..15e2d21005e03 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -42,6 +42,8 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { INFRT_KERNEL(infrt::kernel::phi::CreateDenseTensorCpuF32Nchw)); registry->AddKernel("phi_dt.fill_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); + registry->AddKernel("phi_dt.print_tensor", + INFRT_KERNEL(infrt::kernel::phi::PrintDenseTensor)); registry->AddKernel( "phi_dt.fake_phi_kernel", std::bind(&KernelLauncherFunc !phi.CPU_allocator - %ctx = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %ctx = "phi_dt.create_context.cpu" (%allocator): (!phi.CPU_allocator) -> !phi.CPU_context %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) + "phi_dt.fill_dense_tensor.f32"(%t) {value=[3.8:f32]} : (!infrt.dense_tensor) -> () + %e = "phi_cpu.sign.any.float32"(%ctx, %t) : (!phi.CPU_context, !infrt.dense_tensor) -> (!infrt.dense_tensor) - // CHECK: @FakePhiKernel@ - %d = "phi_dt.fake_phi_kernel" (%ctx, %t, %t) {transpose_x=false, transpose_y=false} : (!phi.CPU_context, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) + // CHECK: dense_tensor: shape=shape[1], values=[1] + "phi_dt.print_tensor" (%e) : (!infrt.dense_tensor) -> () Infrt.return } diff --git a/paddle/scripts/infrt_build.sh b/paddle/scripts/infrt_build.sh index a0132501387e0..75b27e4165d17 100755 --- a/paddle/scripts/infrt_build.sh +++ b/paddle/scripts/infrt_build.sh @@ -33,14 +33,16 @@ function update_pd_ops() { rm -rf ${PADDLE_ROOT}/build && mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build cmake .. 
-DWITH_PYTHON=ON -DWITH_GPU=OFF -DPYTHON_EXECUTABLE=`which python3` -DWITH_XBYAK=OFF -DWITH_NCCL=OFF -DWITH_RCCL=OFF -DWITH_CRYPTO=OFF - make -j8 paddle_python + make -j8 paddle_python print_pten_kernels cd ${PADDLE_ROOT}/build + ./paddle/phi/tools/print_pten_kernels > ../tools/infrt/kernels.json cd python/dist/ python3 -m pip uninstall -y paddlepaddle python3 -m pip install *whl # update pd_ops.td cd ${PADDLE_ROOT}/tools/infrt/ python3 generate_pd_op_dialect_from_paddle_op_maker.py + python3 generate_phi_kernel_dialect.py ./kernels.json } function init() { diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py new file mode 100644 index 0000000000000..80cf3958b156d --- /dev/null +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -0,0 +1,276 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import sys + +attr_type_converter = {"i": 'SI32Attr', "b": 'BoolAttr', "l": 'SI64Attr'} +supported_kernels = ['sign', 'dot', 'digamma', 'conj'] + +target_type_converter = {"CPU": "CPU", "GPU": "GPU"} +layout_type_converter = { + "NCHW": "NCHW", + "NHWC": "NHWC", + "Undefined(AnyLayout)": "ANY" +} +precision_type_converter = { + "uint8": "UINT8", + "int8": "INT8", + "int16": "INT16", + "int32": "INT32", + "int64": "INT64", + "float16": "FLOAT16", + "bfloat16": "BFLOAT16", + "float32": "FLOAT32", + "float64": "FLOAT64", + "complex64": "COMPLEX64", + "complex128": "COMPLEX128", + "bool": "BOOL" +} + + +def generate_kernel_name(op_name, place_str): + [target_, layout_, precision_] = place_str[1:-1].split(',') + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + alias_ = "{}.{}".format(op_name, ".".join( + [target_.strip(), layout_.strip(), precision_.strip()])) + return alias_ + + +def generate_attrs_info(op_name, attrs_info): + kernel_attrs_names = { + 'split': ['sections', 'num', 'axis', 'mkldnn_data_type'], + 'sign': [], + 'masked_select': [], + 'trace': ['offset', 'axis1', 'axis2'], + 'concat': ['axis'], + 'empty': ['shape', 'dtype'], + 'conj': [], + 'norm': ['axis', 'epsilon', 'is_test'], + 'histogram': ['bins', 'min', 'max'], + 'dot': [], + 'scale': ['scale', 'bias', 'bias_after_scale'], + 'digamma': [], + 'lerp': [], + 'cast': ['out_dtype', 'in_dtype'], + 'abs': [] + } + attrs_args_ = "" + if len(kernel_attrs_names[op_name]) == len(attrs_info): + for index in range(len(attrs_info)): + attr_name = kernel_attrs_names[op_name][index] + attr_type = attr_type_converter[attrs_info[index]] + attrs_args_ += '{type_}:${name_},'.format( + type_=attr_type, name_=attr_name) + return attrs_args_[:-1] + + +def generate_inputs_info(input_info): + input_args_ = "" + for index in range(len(input_info)): + [target_, layout_, precision_] = input_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = 
layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + input_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$in{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + input_args_ = input_args_[:-1] + return input_args_ + + +def generate_arguments_info(op_name, input_info, attr_info): + input_args = generate_inputs_info(input_info) + attr_args = generate_attrs_info(op_name, attr_info) + context_args = "CPU_Context:$dev_ctx" + argument_ = "{},{},{}".format(context_args, input_args, attr_args) + return (("let arguments = (ins {});".format(argument_.strip(",")))) + + +def generate_results_info(output_info): + output_args_ = "let results = (outs " + for index in range(len(output_info)): + [target_, layout_, precision_] = output_info[index].split(',') + # todo: check vadility + target_ = target_type_converter[target_.strip()] + layout_ = layout_type_converter[layout_.strip()] + precision_ = precision_type_converter[precision_.strip()] + output_args_ += " DenseTensor<\"{}\",\"{}\",\"{}\">:$out{},".format( + target_.strip(), precision_.strip(), layout_.strip(), str(index)) + return ("{});".format(output_args_[:-1])) + + +def generate_supported_kernel_list(load_dict): + supported_kernels_list_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + attributes = kernel_info[kernel_alias_]["attribute"] + flag = True + for attribute in attributes: + if attribute not in attr_type_converter: + flag = False + if flag: + supported_kernels_list_.append(op_name) + + alias_ = generate_kernel_dialect(op_name, kernel_alias_, + kernel_info[kernel_alias_]) + supported_kernels_list_ = list(set(supported_kernels_list_)) + print(supported_kernels_list_) + + +def scan_kernel_info(load_dict): + target_type_ = [] + layout_type_ = [] + precision_type_ = [] + for op_name in load_dict: + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + [target_, layout_, precision_] = kernel_alias_[1:-1].split(',') + target_type_.append(target_.strip()) + layout_type_.append(layout_.strip()) + precision_type_.append(precision_.strip()) + target_type_ = list(set(target_type_)) + layout_type_ = list(set(layout_type_)) + precision_type_ = list(set(precision_type_)) + print(target_type_) + print(layout_type_) + print(precision_type_) + + +def generate_cpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." 
+ dialect_name[ + 3] + + header = 'def {kernel_name} : PDTCPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_gpu_kernel_dialect(op_name, kernel_alias_, kernel_info): + + alias = generate_kernel_name(op_name, kernel_alias_) + summary = 'let summary = "{name}";'.format(name=alias) + dialect_name = alias.split(".") + dialect_name = dialect_name[0] + "." + dialect_name[2] + "." + dialect_name[ + 3] + + header = 'def {kernel_name} : PDTGPU_Kernel<"{name}",[NoSideEffect]> {left_brace}'.format( + kernel_name=alias.replace(".", ""), + name=dialect_name.lower(), + left_brace="{") + inputs_ = kernel_info["input"] + attributes = kernel_info["attribute"] + arguments = generate_arguments_info(op_name, inputs_, attributes) + + outputs = kernel_info["output"] + results = generate_results_info(outputs) + + kernel_dialect = '{header_}\n {summary_}\n {arguments_}\n {results_}\n{right_brace}\n'.format( + header_=header, + summary_=summary, + arguments_=arguments, + results_=results, + right_brace="}") + return kernel_dialect + + +def generate_dialect_head(): + comment_ = "/*===- TableGen'source file -----------------------------------------------===*\\\n\ +|* *|\n\ +|* Kernel Definitions *|\n\ +|* *|\n\ +|* Automatically generated file, do not edit! 
*|\n\ +|* Generated by tools/infrt/generate_pten_kernel_dialect.py *|\n\ +|* *|\n\ +\*===----------------------------------------------------------------------===*/\n" + + includes_ = "#ifndef PTEN_KERNELS\n\ +#define PTEN_KERNELS\n\ +include \"mlir/Interfaces/InferTypeOpInterface.td\"\n\ +include \"mlir/Interfaces/LoopLikeInterface.td\"\n\ +include \"mlir/IR/OpBase.td\"\n\ +include \"paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td\"" + + return (comment_ + includes_) + + +def get_kernel_target(kernel_alias_): + target = kernel_alias_[1:-1].split(",") + return target[0] + + +def main(path_): + with open(path_, "r") as f: + load_dict = json.load(f) + + head = generate_dialect_head() + + cpu_registry_ = "" + gpu_registry_ = "" + for op_name in load_dict: + if op_name not in supported_kernels: + continue + kernel_list = load_dict[op_name] + for kernel_info in kernel_list: + for kernel_alias_ in kernel_info: + if get_kernel_target(kernel_alias_) == "CPU": + kernel_registry = generate_cpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + cpu_registry_ += kernel_registry + elif get_kernel_target(kernel_alias_) == "GPU": + kernel_registry = generate_gpu_kernel_dialect( + op_name, kernel_alias_, kernel_info[kernel_alias_]) + gpu_registry_ += kernel_registry + else: + print("Unsupported backend:" + get_kernel_target( + kernel_alias_)) + end = "#endif // PTEN_KERNELS" + with open("../../paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=cpu_registry_, end_=end)) + with open("../../paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td", + "w") as dst: + dst.write('{start_}\n{dialect_}\n{end_}'.format( + start_=head, dialect_=gpu_registry_, end_=end)) + + +if __name__ == '__main__': + path = sys.argv[1] + main(path) diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index f3e9f345da27b..9ea3fef003054 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -150,19 +150,19 @@ def gen_dtype(vals: List[str]): ir_dtypes, origin_dtypes = [], [] for val in vals: if val == "float": - ir_dtypes.append("fp32") + ir_dtypes.append("float32") origin_dtypes.append("float") elif val == "double": - ir_dtypes.append("fp64") + ir_dtypes.append("float64") origin_dtypes.append("double") elif val == "float16": - ir_dtypes.append("fp16") + ir_dtypes.append("float16") origin_dtypes.append("paddle::experimental::float16") elif val == "bfloat16": ir_dtypes.append("bf16") origin_dtypes.append("paddle::experimental::bfloat16") elif val == "bool": - ir_dtypes.append("int1") + ir_dtypes.append("bool") origin_dtypes.append("bool") elif val == "int8_t": ir_dtypes.append("int8") @@ -219,8 +219,8 @@ def gen_register_info(resources: List[List[str]]): for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype) - ir_name = 'pten.' + '.'.join( - [it.lower() for it in update_item[:3]]) + "." + ir_dtype + ir_name = 'phi_cpu.' + update_item[0].lower() + '.' + update_item[ + 2].lower() + '.' 
+ ir_dtype res += f""" registry->AddKernel("{ir_name}",""" From f30b3f810d1b7e341507450313503cf4702f7d8a Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 2 Mar 2022 16:17:43 +0800 Subject: [PATCH 14/41] support checking `phi` directory in CI op benchmark (#40026) * support phi checking in CI op benchmark * add sparse/gpu * remove h file in cpu directory --- tools/ci_op_benchmark.sh | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 1db79418b2d8f..0937ebe5343fc 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -43,20 +43,33 @@ function match_cu_file_directory { do [ "${cu_file_dir}" == "paddle/fluid/operators${sub_dir}" ] && return 0 done - for sub_dir in "" "/gpu" "/hybird" + for sub_dir in "" "/gpu" "/gpudnn" "/sparse/gpu" do [ "${cu_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 0 done return 1 } +# Limit h file directory +function match_h_file_directory { + LOG "[INFO] run function match_h_file_directory" + local sub_dir h_file_dir + h_file_dir=$(dirname ${1}) + # '.h' file should not in directory below + for sub_dir in "" "/cpu" + do + [ "${h_file_dir}" == "paddle/phi/kernels${sub_dir}" ] && return 1 + done + return 0 +} + # Load op files by header file function load_CHANGE_OP_FILES_by_header_file { LOG "[INFO] run function load_CHANGE_OP_FILES_by_header_file" local change_file for change_file in $(grep -rl "${1}" paddle/fluid/operators paddle/phi/kernels/) do - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -64,6 +77,7 @@ function load_CHANGE_OP_FILES_by_header_file { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue [ -n "${INCLUDE_SEARCH_MAP[$change_file]}" ] && continue LOG "[INFO] Found \"${1}\" include by \"${change_file}\", keep searching." INCLUDE_SEARCH_MAP[$change_file]="searched" @@ -82,7 +96,7 @@ function load_CHANGE_OP_FILES { # match directory limit [[ "$change_file" =~ "paddle/fluid/operators/" ]] || [[ "$change_file" =~ "paddle/phi/kernels/" ]] || continue # match file name limit - if [[ "$change_file" =~ "_op.cu" ]] + if [[ "$change_file" =~ "_op.cu" || "$change_file" =~ "_kernel.cu" || "$change_file" =~ "_kernel_gpudnn.cu" ]] then # match cu file directory limit match_cu_file_directory $change_file || continue @@ -90,6 +104,7 @@ function load_CHANGE_OP_FILES { CHANGE_OP_FILES[${#CHANGE_OP_FILES[@]}]="$change_file" elif [[ "$change_file" =~ ".h" ]] then + match_h_file_directory $change_file || continue LOG "[INFO] Found \"${change_file}\" changed, keep searching." INCLUDE_SEARCH_MAP[${change_file}]="searched" load_CHANGE_OP_FILES_by_header_file $change_file @@ -131,6 +146,8 @@ function load_CHANGE_OP_MAP { op_name=${change_file_name##*/} op_name=${op_name%_cudnn_op*} op_name=${op_name%_op*} + op_name=${op_name%_grad_kernel*} + op_name=${op_name%_kernel*} [ -n "${SKIP_OP_MAP[$op_name]}" ] && continue LOG "[INFO] Load op: \"${op_name}\"." CHANGE_OP_MAP[${op_name}]="$change_file" From 1c4e3e5dd0d32a4216bdad0b1cafcab4ca5ed5bb Mon Sep 17 00:00:00 2001 From: ziyoujiyi <73728031+ziyoujiyi@users.noreply.github.com> Date: Wed, 2 Mar 2022 16:23:52 +0800 Subject: [PATCH 15/41] new fleet_desc builder (#39948) * delete gloo connect retry * the_one_ps dirs reconstruct * . 
* . * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * create the_one_ps dirs * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * the one ps dirs modify * refactor ps optimize * refactor ps optimize * refactor ps optimize * . * . * . * . * . * . * refactor theoneps * the_one_ps * add ps pass unittest * add ps pass unittest * ps unitest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * ps unittest frame * add cpu_async_ps_mode test * add cpu_async_ps_mode test * add cpu_async_ps_mode test * ps unittest ready * ps unittest ready * solve dist_pass init conflict * solve import CommContext error * unittest ok * implement AllocateFrom * solve setup.py.in conflict * solve conflict * solve conflict * solve conflict * . * . * cpu-async-ps minimize test ok & gpu minimize test ok * add heter 2stage unittest * add heter 2stage unittest * add heter 2stage unittest * sync/geo test ok & fix heter_worker program ok * . * new fleet desc generator * new fleet_desc builder * new fleet_desc builder * . * . * correct ps.proto compile * . Co-authored-by: zkh2016 --- paddle/fluid/distributed/ps/ps.proto | 13 - paddle/fluid/framework/CMakeLists.txt | 5 +- paddle/fluid/framework/ps.proto | 213 ++++ .../fleet/meta_optimizers/ps_optimizer.py | 1 + python/paddle/distributed/ps/README.md | 3 - python/paddle/distributed/ps/the_one_ps.py | 1022 ++++++++--------- .../paddle/distributed/ps/utils/ps_factory.py | 4 +- .../ps/utils/ps_program_builder.py | 5 +- python/paddle/distributed/ps/utils/public.py | 4 +- .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../distributed_passes/ps_pass_test_base.py | 54 +- .../test_ps_trainer_pass.py | 122 +- .../fluid/tests/unittests/ps/CMakeLists.txt | 4 +- .../tests/unittests/ps/ps_dnn_trainer.py | 86 +- .../tests/unittests/ps/test_the_one_ps.py | 92 +- .../fluid/tests/unittests/ps_dnn_model.py | 1 + 16 files changed, 961 insertions(+), 670 deletions(-) delete mode 100755 paddle/fluid/distributed/ps/ps.proto mode change 100644 => 100755 paddle/fluid/framework/CMakeLists.txt create mode 100755 paddle/fluid/framework/ps.proto delete mode 100755 python/paddle/distributed/ps/README.md mode change 100644 => 100755 python/paddle/fluid/tests/unittests/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/CMakeLists.txt mode change 100644 => 100755 python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py diff --git a/paddle/fluid/distributed/ps/ps.proto b/paddle/fluid/distributed/ps/ps.proto deleted file mode 100755 index 2691f637527d4..0000000000000 --- a/paddle/fluid/distributed/ps/ps.proto +++ /dev/null @@ -1,13 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt old mode 100644 new mode 100755 index 14aecb5fd43c4..02d90b9c6da1e --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -235,6 +235,7 @@ if(WITH_PYTHON) py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto) + py_proto_compile(ps_py_proto SRCS ps.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. add_custom_target(fleet_proto_init ALL @@ -242,12 +243,13 @@ if(WITH_PYTHON) COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init pass_desc_py_proto ps_py_proto) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto @@ -259,6 +261,7 @@ if(WITH_PYTHON) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto COMMAND copy /Y *.py ${proto_dstpath} + COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto." diff --git a/paddle/fluid/framework/ps.proto b/paddle/fluid/framework/ps.proto new file mode 100755 index 0000000000000..0ae87812bce43 --- /dev/null +++ b/paddle/fluid/framework/ps.proto @@ -0,0 +1,213 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
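+//
+// Runtime configuration for the parameter server. PSParameter is the root
+// message; per-table settings (TableParameter, accessors, sparse SGD rules)
+// are filled in directly through the generated ps_pb2 module by
+// python/paddle/distributed/ps/the_one_ps.py.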
+ +syntax = "proto2"; +package paddle.distributed; +option cc_generic_services = true; +option cc_enable_arenas = true; + +message FsClientParameter { + enum FsApiType { + HDFS = 0; + AFS = 1; + } + optional FsApiType fs_type = 1 [ default = HDFS ]; + optional string uri = 2; // such as afs://xxx.afs.com:9902 + optional string user = 3; // user_name to access fs + optional string passwd = 4; // password + optional int32 buffer_size = 5; // buffer for read/write + optional string hadoop_bin = 51; + optional string afs_conf = 101; +} + +message PSParameter { + optional string worker_class = 1; + optional string server_class = 2; + optional string instance_class = 3; + optional string init_gflags = 4 [ default = "" ]; + optional WorkerParameter worker_param = 101; + optional ServerParameter server_param = 102; + repeated DownpourTrainerParameter trainer_param = 301; + optional FsClientParameter fs_client_param = 501; +} + +message WorkerParameter { + optional DownpourWorkerParameter downpour_worker_param = 1; +} + +message DownpourWorkerParameter { + repeated TableParameter downpour_table_param = 1; +} + +message DownpourServerParameter { + repeated TableParameter downpour_table_param = 1; + optional ServerServiceParameter service_param = 2; +} + +message ServerParameter { + optional DownpourServerParameter downpour_server_param = 1; +} + +message DownpourTrainerParameter { + repeated DenseTableParameter dense_table = 1; + repeated SparseTableParameter sparse_table = 2; + optional int32 push_sparse_per_batch = 3; + optional int32 push_dense_per_batch = 4; + repeated string skip_op = 5; + repeated ProgramConfig program_config = 6; +} + +message DenseTableParameter { + optional int32 table_id = 1; + repeated string dense_variable_name = 2; + repeated string dense_gradient_variable_name = 3; + optional int32 fea_dim = 4; +} + +message SparseTableParameter { + optional int32 table_id = 1; + optional int32 feature_dim = 2; + repeated string slot_key = 3; + repeated string slot_value = 4; + repeated string slot_gradient = 5; +} + +message ServerServiceParameter { + optional string server_class = 1 [ default = "BrpcPsServer" ]; + optional string client_class = 2 [ default = "BrpcPsClient" ]; + optional string service_class = 3 [ default = "BrpcPsService" ]; + optional uint32 start_server_port = 4 + [ default = 0 ]; // will find a avaliable port from it + optional uint32 server_thread_num = 5 [ default = 12 ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +enum TableType { + PS_SPARSE_TABLE = 0; + PS_DENSE_TABLE = 1; + PS_OTHER_TABLE = 2; +} + +message TableParameter { + optional uint64 table_id = 1; + optional string table_class = 2; + optional uint64 shard_num = 3 [ default = 1000 ]; + optional TableAccessorParameter accessor = 4; + optional TensorAccessorParameter tensor = 5; + optional CommonAccessorParameter common = 6; + optional TableType type = 7; + optional bool compress_in_save = 8 [ default = false ]; +} + +message TableAccessorParameter { + optional string accessor_class = 1; + optional uint32 fea_dim = 4 [ default = 11 ]; + optional uint32 embedx_dim = 5 [ default = 8 ]; + optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; + repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SparseCommonSGDRuleParameter 
embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd +} + +message TensorAccessorParameter { + optional string feed_var_name = 1; + optional string fetch_var_name = 2; + optional int64 startup_program_id = 3; + optional int64 main_program_id = 4; + optional string tensor_table_class = 6; +} + +message CommonAccessorParameter { + optional string name = 1; + optional string table_name = 2; + repeated string attributes = 3; + repeated string params = 4; + repeated uint32 dims = 5; + repeated string initializers = 6; + optional string entry = 7; + optional int32 trainer_num = 8; + optional bool sync = 9; + optional uint32 table_num = 10; + optional uint32 table_dim = 11; +} + +message TableAccessorSaveParameter { + optional uint32 param = 1; + optional string converter = 2; + optional string deconverter = 3; +} + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py index 100a6882b1b35..00937dbe7a432 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py @@ -54,6 +54,7 @@ def _init_ps_pass_context(self, loss, startup_program): attrs['cloned_startup'] = attrs['origin_startup_program'].clone() attrs['user_defined_strategy'] = self.user_defined_strategy + 
attrs['valid_strategy'] = self.user_defined_strategy attrs['trainer'] = TrainerRuntimeConfig(self.user_defined_strategy) attrs['ps_mode'] = attrs['trainer'].mode logger.info("ps_mode: {}".format(attrs['ps_mode'])) diff --git a/python/paddle/distributed/ps/README.md b/python/paddle/distributed/ps/README.md deleted file mode 100755 index 8d28031794f5d..0000000000000 --- a/python/paddle/distributed/ps/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# 目录说明 - -* 改完之后,上层目录中 fleet 中相关文件(夹)就可以删除 diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 14a68ad916747..cc744bc9d9edb 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -15,10 +15,11 @@ import warnings import os +from paddle.distributed.fleet.proto import ps_pb2 import paddle.fluid as fluid import paddle.distributed.fleet as fleet from paddle.fluid import core -from .utils.public import * +from paddle.distributed.ps.utils.public import * from paddle.fluid.framework import Program from paddle.fluid.compiler import CompiledProgram from paddle.fluid.executor import Executor @@ -29,14 +30,10 @@ from paddle.fluid.communicator import Communicator, HeterClient from google.protobuf import text_format -__all__ = [] - - -def conv_indent(indent): - return "".join([" "] * indent) - - -PSERVER_SAVE_SUFFIX = ".shard" +__all__ = [ + 'Table', 'SparseTable', 'GeoSparseTable', 'BarrierTable', 'TensorTable', + 'DenseTable' +] def get_program_by_id(context, program_id): @@ -62,129 +59,140 @@ def parse_table_class(varname, program_id, context): return "MemorySparseTable" -def get_default_accessor_proto(accessor, varname, program_id, context): +def check_embedding_dim(accessor_proto, varname, program_id, context): main_program, startup_program = get_program_by_id(context, program_id) embedding_dim = 0 for var in main_program.list_vars(): if var.name == varname: embedding_dim = var.shape[1] + print('new var: {}, {}, {}'.format(var, embedding_dim, + accessor_proto.fea_dim)) break - - if not accessor.HasField("accessor_class"): - accessor.accessor_class = "CtrCommonAccessor" - if not accessor.HasField("fea_dim"): - accessor.fea_dim = embedding_dim + 2 - if not accessor.HasField("embedx_dim"): - accessor.embedx_dim = embedding_dim - 1 - if not accessor.HasField("embedx_threshold"): - accessor.embedx_threshold = 0 - - ctr_accessor_param = accessor.ctr_accessor_param - if not ctr_accessor_param.HasField("nonclk_coeff"): - ctr_accessor_param.nonclk_coeff = 0.1 - if not ctr_accessor_param.HasField("click_coeff"): - ctr_accessor_param.click_coeff = 1.0 - if not ctr_accessor_param.HasField("base_threshold"): - ctr_accessor_param.base_threshold = 0 - if not ctr_accessor_param.HasField("delta_threshold"): - ctr_accessor_param.delta_threshold = 0 - if not ctr_accessor_param.HasField("delta_keep_days"): - ctr_accessor_param.delta_keep_days = 16 - if not ctr_accessor_param.HasField("show_click_decay_rate"): - ctr_accessor_param.show_click_decay_rate = 1 - if not ctr_accessor_param.HasField("delete_threshold"): - ctr_accessor_param.delete_threshold = 0 - if not ctr_accessor_param.HasField("delete_after_unseen_days"): - ctr_accessor_param.delete_after_unseen_days = 30 - if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): - ctr_accessor_param.ssd_unseenday_threshold = 1 - - for sgd_param in [accessor.embed_sgd_param, accessor.embedx_sgd_param]: - if not sgd_param.HasField("name"): - sgd_param.name = "SparseAdaGradSGDRule" - if sgd_param.name == "SparseAdaGradSGDRule" 
or sgd_param.name == "StdAdaGradSGDRule": - if not sgd_param.adagrad.HasField("learning_rate"): - sgd_param.adagrad.learning_rate = 0.05 - if not sgd_param.adagrad.HasField("initial_g2sum"): - sgd_param.adagrad.initial_g2sum = 3.0 - if not sgd_param.adagrad.HasField("initial_range"): - sgd_param.adagrad.initial_range = 0.0001 - if len(sgd_param.adagrad.weight_bounds) == 0: - sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseNaiveSGDRule": - if not sgd_param.naive.HasField("learning_rate"): - sgd_param.naive.learning_rate = 0.05 - if not sgd_param.naive.HasField("initial_range"): - sgd_param.naive.initial_range = 0.0001 - if len(sgd_param.naive.weight_bounds) == 0: - sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) - if sgd_param.name == "SparseAdamSGDRule": - if not sgd_param.adam.HasField("learning_rate"): - sgd_param.adam.learning_rate = 0.001 - if not sgd_param.adam.HasField("initial_range"): - sgd_param.adam.initial_range = 0.0001 - if not sgd_param.adam.HasField("beta1_decay_rate"): - sgd_param.adam.beta1_decay_rate = 0.9 - if not sgd_param.adam.HasField("beta2_decay_rate"): - sgd_param.adam.beta2_decay_rate = 0.999 - if not sgd_param.adam.HasField("ada_epsilon"): - sgd_param.adam.ada_epsilon = 1e-08 - if len(sgd_param.adam.weight_bounds) == 0: - sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) - - -def check_embedding_dim(accessor, varname, program_id, context): - main_program, startup_program = get_program_by_id(context, program_id) - embedding_dim = 0 - for var in main_program.list_vars(): - if var.name == varname: - embedding_dim = var.shape[1] - break - fea_dim = accessor.fea_dim + fea_dim = accessor_proto.fea_dim if fea_dim != embedding_dim + 2: raise ValueError( "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}". format(embedding_dim + 2, fea_dim)) - embedx_dim = accessor.embedx_dim + embedx_dim = accessor_proto.embedx_dim if embedx_dim != embedding_dim - 1: raise ValueError( "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}". 
format(embedding_dim - 1, embedx_dim)) +class Service: + def __init__(self): + pass + + def _set(self, service_proto): + service_proto.server_class = "BrpcPsServer" + service_proto.client_class = "BrpcPsClient" + service_proto.service_class = "BrpcPsService" + service_proto.start_server_port = 0 + service_proto.server_thread_num = 12 + + +class GpuService(Service): + def __init__(self): + super(GpuService).__init__(self) + + def _set(self, service_proto): + super(GpuService)._set(service_proto) + service_proto.server_class = 'PsLocalServer' + service_proto.client_class = 'PsLocalClient' + + class Accessor: def __init__(self): self.accessor_class = "" self.optimizer = None - self.feature_dim = -1 - self.embedding_dim = -1 - self.optimizer = None - - def to_string(self, indent): - accessor_str = "{}accessor {{{}\n{}}}" - attrs = "" - attrs += "accessor_class: \"{}\" ".format(self.accessor_class) - attrs += "fea_dim: {} ".format(self.feature_dim) - attrs += "embedx_dim: {} ".format(self.embedding_dim) - attrs += "\n" - if self.optimizer is not None: - attrs += self.optimizer.to_string(indent) - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + self.feature_dim = 0 + self.embedding_dim = 0 + # TableAccessorParameter accessor + def _set(self, accessor_proto, varname, program_id, context): + main_program, startup_program = get_program_by_id(context, program_id) + embedding_dim = 0 + for var in main_program.list_vars(): + if var.name == varname: + embedding_dim = var.shape[1] + break -class CommonAccessor: + if not accessor_proto.HasField("accessor_class"): + accessor_proto.accessor_class = "CtrCommonAccessor" + if not accessor_proto.HasField("fea_dim"): + accessor_proto.fea_dim = embedding_dim + 2 + if not accessor_proto.HasField("embedx_dim"): + accessor_proto.embedx_dim = embedding_dim - 1 + if not accessor_proto.HasField("embedx_threshold"): + accessor_proto.embedx_threshold = 0 + + ctr_accessor_param = accessor_proto.ctr_accessor_param + if not ctr_accessor_param.HasField("nonclk_coeff"): + ctr_accessor_param.nonclk_coeff = 0.1 + if not ctr_accessor_param.HasField("click_coeff"): + ctr_accessor_param.click_coeff = 1.0 + if not ctr_accessor_param.HasField("base_threshold"): + ctr_accessor_param.base_threshold = 0 + if not ctr_accessor_param.HasField("delta_threshold"): + ctr_accessor_param.delta_threshold = 0 + if not ctr_accessor_param.HasField("delta_keep_days"): + ctr_accessor_param.delta_keep_days = 16 + if not ctr_accessor_param.HasField("show_click_decay_rate"): + ctr_accessor_param.show_click_decay_rate = 1 + if not ctr_accessor_param.HasField("delete_threshold"): + ctr_accessor_param.delete_threshold = 0 + if not ctr_accessor_param.HasField("delete_after_unseen_days"): + ctr_accessor_param.delete_after_unseen_days = 30 + if not ctr_accessor_param.HasField("ssd_unseenday_threshold"): + ctr_accessor_param.ssd_unseenday_threshold = 1 + + for sgd_param in [ + accessor_proto.embed_sgd_param, accessor_proto.embedx_sgd_param + ]: + if not sgd_param.HasField("name"): + sgd_param.name = "SparseAdaGradSGDRule" + if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": + if not sgd_param.adagrad.HasField("learning_rate"): + sgd_param.adagrad.learning_rate = 0.05 + if not sgd_param.adagrad.HasField("initial_g2sum"): + sgd_param.adagrad.initial_g2sum = 3.0 + if not sgd_param.adagrad.HasField("initial_range"): + sgd_param.adagrad.initial_range = 0.0001 + if len(sgd_param.adagrad.weight_bounds) == 0: + 
sgd_param.adagrad.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseNaiveSGDRule": + if not sgd_param.naive.HasField("learning_rate"): + sgd_param.naive.learning_rate = 0.05 + if not sgd_param.naive.HasField("initial_range"): + sgd_param.naive.initial_range = 0.0001 + if len(sgd_param.naive.weight_bounds) == 0: + sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) + if sgd_param.name == "SparseAdamSGDRule": + if not sgd_param.adam.HasField("learning_rate"): + sgd_param.adam.learning_rate = 0.001 + if not sgd_param.adam.HasField("initial_range"): + sgd_param.adam.initial_range = 0.0001 + if not sgd_param.adam.HasField("beta1_decay_rate"): + sgd_param.adam.beta1_decay_rate = 0.9 + if not sgd_param.adam.HasField("beta2_decay_rate"): + sgd_param.adam.beta2_decay_rate = 0.999 + if not sgd_param.adam.HasField("ada_epsilon"): + sgd_param.adam.ada_epsilon = 1e-08 + if len(sgd_param.adam.weight_bounds) == 0: + sgd_param.adam.weight_bounds.extend([-10.0, 10.0]) + + +class CommonAccessor(Accessor): def __init__(self): - self.accessor_class = "" - self.table_name = None - self.entry = None + super(CommonAccessor, self).__init__() + self.table_name = '' + self.entry = 'none' self.attrs = [] self.params = [] self.dims = [] self.trainer_num = 0 - self.sync = "false" - self.table_num = None - self.table_dim = None + self.sync = False self.initializers = [] self.opt_input_map = {} self.opt_attr_map = {} @@ -422,233 +430,361 @@ def parse_by_optimizer(self, ctx, context): self.initializers = initializers self.attrs = attrs - def to_string(self, indent): - accessor_str = "{}common {{{}\n{}}}" - attrs = "" - attrs += "name: \"{}\" ".format(self.accessor_class) - - if self.table_name: - attrs += "table_name: \"{}\" ".format(self.table_name) - - if self.entry: - attrs += "entry: \"{}\" ".format(self.entry) - attrs += "trainer_num: {} ".format(self.trainer_num) - attrs += "sync: {} ".format(self.sync) - if self.table_num: - attrs += "table_num: {} ".format(self.table_num) - if self.table_dim: - attrs += "table_dim: {} ".format(self.table_dim) - - for param in self.params: - attrs += "params: \"{}\" ".format(param) - - for dim in self.dims: - attrs += "dims: {} ".format(dim) - - for initializer in self.initializers: - attrs += "initializers: \"{}\" ".format(initializer) - - attrs += "\n" - return accessor_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + # CommonAccessorParameter common + def _set(self, proto): + proto.name = self.accessor_class + proto.table_name = self.table_name + proto.params.extend(self.params) + proto.dims.extend(self.dims) + proto.initializers.extend(self.initializers) + proto.entry = self.entry + proto.trainer_num = self.trainer_num + proto.sync = self.sync + proto.table_num = self.table_num + proto.table_dim = self.table_dim class Tensor: - def __init__(self): - self.main_program_id = None - self.startup_program_id = None - self.feed_var_name = None - self.fetch_var_name = None - self.tensor_table_class = False - - def to_string(self, indent): - program_str = "{}tensor {{{}\n{}}}" - attrs = "" - attrs += "feed_var_name: \"{}\" ".format(str(self.feed_var_name)) - attrs += "fetch_var_name: \"{}\" ".format(str(self.fetch_var_name)) - attrs += "startup_program_id: {} ".format(str(self.startup_program_id)) - attrs += "main_program_id: {} ".format(str(self.main_program_id)) - attrs += "tensor_table_class: \"{}\" ".format( - str(self.tensor_table_class)) - attrs += "\n" - return program_str.format( - conv_indent(indent), attrs, conv_indent(indent)) + def 
__init__(self, tesnor_dcit): + self.tensor_dict = tesnor_dcit + + def _set(self, tensor_proto): + tensor_proto.main_program_id = self.tensor_dict.get("main_program_id", + 0) + tensor_proto.startup_program_id = self.tensor_dict.get( + "startup_program_id", 0) + tensor_proto.feed_var_name = self.tensor_dict.get("feed_var_name", '') + tensor_proto.fetch_var_name = self.tensor_dict.get("fetch_var_name", '') + tensor_proto.tensor_table_class = self.tensor_dict.get( + "tensor_table_class", '') class Table: def __init__(self): - self.id = -1 self.table_class = None self.shard_num = -1 self.type = None - self.accessor = None - self.common = None + self.accessor = Accessor() + self.shard_num = 256 + self.common = CommonAccessor() self.tensor = None - self.accessor_proto = None - - def to_string(self, indent): - # if self.id == 1: - # proto_txt = '' - # with open('./sparse_table.prototxt') as f: - # proto_txt = f.read() - # return proto_txt - table_str = "{}downpour_table_param {{{}\n{}}}" - - attrs = "" - attrs += "table_id: {} ".format(self.id) - attrs += "table_class: \"{}\" ".format(self.table_class) - attrs += "shard_num: {} ".format(self.shard_num) - attrs += "type: {}".format(self.type) - attrs += "\n" - indent += 2 - - if self.accessor_proto is not None: - accessor_str = "{}accessor {{{}\n{}}}" - accessor_str = accessor_str.format( - conv_indent(indent), self.accessor_proto, conv_indent(indent)) - attrs += accessor_str + "\n" - elif self.accessor is not None: - attrs += self.accessor.to_string(indent) - attrs += "\n" - - if self.tensor is not None: - attrs += self.tensor.to_string(indent) - attrs += "\n" - - if self.common is not None: - attrs += self.common.to_string(indent) - attrs += "\n" - - return table_str.format(conv_indent(indent), attrs, conv_indent(indent)) + def _set(self, table_proto): + pass -class Service: - def __init__(self): - self.server_class = "BrpcPsServer" - self.client_class = "BrpcPsClient" - self.service_class = "BrpcPsService" - self.start_server_port = 0 - self.server_thread_num = 12 - def to_string(self, indent): - service_str = "{}service_param {{{}\n{}}}" +class BarrierTable(Table): + def __init__(self, context, idx): + super(BarrierTable, self).__init__() + self.type = None + self.shard_num = 256 + self.accessor.accessor_class = 'CommMergeAccessor' + self.common.attrs = "" + self.common.dims = [] + self.common.params = [] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.role_maker = context['role_maker'] + self.idx = idx + self.is_sync = context['is_sync'] + + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.table_class = 'BarrierTable' + table_proto.shard_num = 256 + table_proto.type = ps_pb2.PS_OTHER_TABLE + + table_proto.accessor.accessor_class = "CommMergeAccessor" + table_proto.accessor.fea_dim = 0 + table_proto.accessor.embedx_dim = 0 + + table_proto.common.name = "" + table_proto.common.table_name = "barrier_table" + table_proto.common.sync = self.is_sync + table_proto.common.entry = 'none' + + trainer_num = get_trainers(self.role_maker) + if self.is_heter_ps_mode: + trainer_num += len(self.role_maker._get_heter_worker_endpoints()) + table_proto.common.trainer_num = trainer_num - attrs = "" - attrs += "server_class: \"{}\" ".format(self.server_class) - attrs += "client_class: \"{}\" ".format(self.client_class) - attrs += "service_class: \"{}\" ".format(self.service_class) - attrs += "start_server_port: {} ".format(self.start_server_port) - attrs += "server_thread_num: {} ".format(self.server_thread_num) - return 
service_str.format( - conv_indent(indent), attrs, conv_indent(indent)) +class TensorTable(Table): + def __init__(self, idx, tensor_dict, role_maker): + super(TensorTable, self).__init__() + self.idx = idx + self.tensor_dict = tensor_dict + self.role_maker = role_maker + def _set(self, table_proto): + table_proto.table_id = self.idx + table_proto.type = ps_pb2.PS_OTHER_TABLE + table_proto.table_class = self.tensor_dict.get("tensor_table_class", '') -class DownpourServer: - def __init__(self): - self.service = None - self.tables = [] + table_proto.accessor.accessor_class = "CommMergeAccessor" - def set_service_param(self, service): - self.service = service + table_proto.common.table_name = self.tensor_dict.get("feed_var_name", + '') + table_proto.common.trainer_num = get_trainers(self.role_maker) - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + tensor = Tensor(self.tensor_dict) + tensor._set(table_proto.tensor) - def to_string(self, indent): - server_str = "{}downpour_server_param {{{}\n{}}}" - table_strs = "" - indent += 2 +class SparseTable(Table): + def __init__(self, context, send_ctx): + super(SparseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.type = None + self.table_class = 'MemorySparseTable' + self.accessor = Accessor() - table_strs += "\n" - table_strs += self.service.to_string(indent) + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + + print('new table_name: {}'.format(self.common.table_name)) + all_table_proto = self.context[ + "user_defined_strategy"].sparse_table_configs + usr_table_proto = all_table_proto.add() + for proto in all_table_proto: + if proto.table_name == self.common.table_name: + usr_table_proto = proto + break + table_proto.table_class = 'MemorySparseTable' + warnings.warn("The PS mode must use MemorySparseTable.") + if usr_table_proto.HasField("shard_num"): + table_proto.shard_num = usr_table_proto.shard_num + else: + table_proto.shard_num = 1000 + warnings.warn( + "The shard_num of sparse table is not set, use default value 1000." 
+ ) - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return server_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) + if usr_table_proto.accessor.ByteSize() == 0: + warnings.warn( + "The accessor of sparse table is not set, use default value.") + table_proto.accessor.ParseFromString( + usr_table_proto.accessor.SerializeToString()) + self.accessor._set(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) -class Server: - def __init__(self): - self.servers = [] + check_embedding_dim(table_proto.accessor, self.common.table_name, + ctx.program_id(), self.context) - def add_server(self, server): - if not isinstance(server, DownpourServer): - raise ValueError("only support instance DownpourServer") - self.servers.append(server) + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False - def __str__(self): - server_str = "server_param {{{}\n}}" - indent = 2 - servers_str = "" - for server in self.servers: - servers_str += "\n" - servers_str += server.to_string(indent) + self.common._set(table_proto.common) - return server_str.format(servers_str) +class GeoSparseTable(SparseTable): + def __init__(self, context, send_ctx): + super(GeoSparseTable, self).__init__(context, send_ctx) + self.table_class = "SparseGeoTable" + if self.context['ps_mode'] != DistributedMode.GEO: + raise ValueError("not geo sparse table!") + + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == False): + return + table_proto.table_id = ctx.table_id() + table_proto.table_class = self.table_class + table_proto.type = ps_pb2.PS_SPARSE_TABLE + table_proto.shard_num = self.shard_num + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = ctx.sections()[1] + + self.common.table_name = self.context['grad_name_to_param_name'][ + ctx.origin_varnames()[0]] + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = False + self.common._set(table_proto.common) + + +class DenseTable(Table): + def __init__(self, context, send_ctx): + super(DenseTable, self).__init__() + self.context = context + self.ctx = send_ctx + self.accessor = Accessor() -class DownpourWorker: + def _set(self, table_proto): + ctx = self.ctx + if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1 or ( + ctx.is_sparse() == True): + return + + table_proto.table_id = ctx.table_id() + + table_proto.type = ps_pb2.PS_DENSE_TABLE + table_proto.table_class = "CommonDenseTable" + table_proto.shard_num = 256 + + table_proto.accessor.accessor_class = 'CommMergeAccessor' + table_proto.accessor.fea_dim = ctx.sections()[0] + table_proto.accessor.embedx_dim = 1 + + self.common.table_name = "MergedDense" + adam_d2sum = self.context["user_defined_strategy"].adam_d2sum + self.common.parse_by_optimizer(ctx, self.context) + self.common.parse_entry(self.common.table_name, + ctx.program_id(), self.context) + self.common.sync = True if self.context['is_sync'] else False + + self.common._set(table_proto.common) + + +class Server: def __init__(self): - self.tables = [] + 
pass - def append_tables(self, table): - if not isinstance(table, Table): - raise ValueError("only support instance Table") - self.tables.append(table) + def _set(self): + pass - def to_string(self, indent): - worker_str = "{}downpour_worker_param {{{}\n{}}}" - table_strs = "" - indent += 2 - for table in self.tables: - table_strs += "\n" - table_strs += table.to_string(indent) - return worker_str.format( - conv_indent(indent), table_strs, conv_indent(indent)) +class DownpourServer(Server): + def __init__(self): + super(DownpourServer, self).__init__() + + def _set(self): + pass class Worker: def __init__(self): - self.workers = [] + pass - def add_worker(self, worker): - if not isinstance(worker, DownpourWorker): - raise ValueError("only support instance DownpourWorker") - self.workers.append(worker) + def _set(self): + pass - def __str__(self): - worker_str = "worker_param {{{}\n}}" - indent = 2 - workers_str = "" - for worker in self.workers: - workers_str += "\n" - workers_str += worker.to_string(indent) - return worker_str.format(workers_str) +class DownpourWorker(Worker): + def __init__(self): + super(DownpourWorker, self).__init__() + + def _set(self): + pass class fsClient: - def __init__(self, proto): - self.proto = proto - self.uri = proto.uri - self.user = proto.user - self.passwd = proto.passwd - self.hadoop_bin = proto.hadoop_bin - - def to_string(self): - proto_txt = text_format.MessageToString(self.proto) - if proto_txt: - fs_str = "fs_client_param {{\n{}}}" - return fs_str.format(proto_txt) + def __init__(self, fs_client_param): + self.fs_client_param = fs_client_param + + def _set(self, proto): + if not text_format.MessageToString(self.fs_client_param): + return + proto.uri = self.fs_client_param.uri + proto.user = self.fs_client_param.user + proto.passwd = self.fs_client_param.passwd + proto.hadoop_bin = self.fs_client_param.hadoop_bin + + +class PsDescBuilder(object): + def __init__(self, context): + self.context = context + self.is_sync = context['is_sync'] + self.ps_mode = context['ps_mode'] + self.is_heter_ps_mode = context['is_heter_ps_mode'] + self.use_ps_gpu = context['use_ps_gpu'] + self.send_ctx = get_the_one_send_context( + self.context, + use_origin_program=True, + split_dense_table=self.is_heter_ps_mode) + + self.tensor_table_dict = {} # TODO + self._server_sub_program = [] + + self.tables = self._get_tables() + + self.service = self._get_service() + self.fs_client = self._get_fs_client() + + self.ps_desc = ps_pb2.PSParameter() + + def _get_tensor_tables(self): + program_idx = 0 + if not self.tensor_table_dict: + self._server_sub_program.append(Program().desc) + tables = [] + for table_name in self.tensor_table_dict: + tables.append(globals()['TensorTable'](len(tables), tensor_dict, + self.context['role_maker'])) + program_idx += 1 + return tables + + def _get_tables(self): + tables = [] + for idx, (name, ctx) in enumerate(self.send_ctx.items()): + print('####### {}\n'.format(ctx.is_sparse())) + if ctx.is_sparse(): + if self.ps_mode == DistributedMode.GEO: + tables.append(globals()['GeoSparseTable'](self.context, + ctx)) + else: + tables.append(globals()['SparseTable'](self.context, ctx)) + else: + tables.append(globals()['DenseTable'](self.context, ctx)) + self.tensor_tables = self._get_tensor_tables() + tables.extend(self.tensor_tables) + tables.append(globals()['BarrierTable'](self.context, len(tables))) + return tables + + def _get_service(self): + if self.use_ps_gpu: + return GpuService() else: - return "" + return Service() + + def _get_fs_client(self): 
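    # A minimal usage sketch of PsDescBuilder (hedged; `context` is assumed to be
    # the attrs dict that TheOnePSRuntime._set_basic_info passes in further below):
    #
    #   builder = PsDescBuilder(context)
    #   worker_desc = builder.build_worker_desc()  # text-format PSParameter for trainers
    #   server_desc = builder.build_server_desc()  # text-format PSParameter for servers
    #
    # Both build_*_desc methods below add one downpour_table_param per Table in
    # self.tables, fill it through that table's _set(), and return
    # text_format.MessageToString(self.ps_desc); the resulting strings are handed
    # to the communicator in _init_worker and to DistFleetWrapper.init_server in
    # _init_server.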
+ return fsClient(self.context["user_defined_strategy"].fs_client_param) + + def build_worker_desc(self): + for table in self.tables: + table_proto = self.ps_desc.worker_param.downpour_worker_param.downpour_table_param.add( + ) + table._set(table_proto) + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + return text_format.MessageToString(self.ps_desc) + + def build_server_desc(self): + for table in self.tables: + table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add( + ) + table._set(table_proto) + self.sparse_table_maps = {} + if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None: + self.sparse_table_maps[ + table_proto.common.table_name] = table_proto.table_id + + self.service._set( + self.ps_desc.server_param.downpour_server_param.service_param) + self.fs_client._set(self.ps_desc.fs_client_param) + return text_format.MessageToString(self.ps_desc) class TheOnePSRuntime(RuntimeBase): @@ -665,8 +801,11 @@ def _set_basic_info(self, context): self.role_maker = context["role_maker"] self.origin_main_program = context["origin_main_program"] - self.origin_main_programs = context["origin_main_programs"] - + self.origin_main_programs = context.get("origin_main_programs", + [self.origin_main_program]) + self.context["origin_main_programs"] = self.origin_main_programs + self.context["origin_startup_programs"] = context.get( + 'origin_startup_programs', [context['origin_startup_program']]) self.context[ 'is_heter_ps_mode'] = self.role_maker._is_heter_parameter_server_mode self.is_heter_ps_mode = self.context['is_heter_ps_mode'] @@ -675,15 +814,23 @@ def _set_basic_info(self, context): self.context['ps_mode'] = self.context['trainer'].mode self.context['use_ps_gpu'] = context['valid_strategy'].a_sync_configs[ 'use_ps_gpu'] - self.is_sync = True if self.context[ + self.context['is_sync'] = True if self.context[ 'ps_mode'] == DistributedMode.SYNC else False self.context['grad_name_to_param_name'] = {} self.context['tensor_table'] = {} build_var_distributed(self.context) + endpoints = get_ps_endpoints(self.role_maker) + self.string_hosts = [] + for idx, ep in enumerate(endpoints): + host, port = ep.split(":") + pshost = fluid.core.PSHost(host, int(port), idx) + self.string_hosts.append(pshost.serialize_to_string()) + + self.ps_desc_builder = PsDescBuilder(self.context) + def _init_worker(self): - worker = self._get_fleet_proto(is_server=False, is_sync=self.is_sync) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) + worker_desc = self.ps_desc_builder.build_worker_desc() if self.context['use_ps_gpu']: main_program = self.context['loss'].block.program @@ -701,23 +848,11 @@ def sync_strategy_envs(): kwargs["trainer_id"] = self.role_maker._worker_index() return kwargs - proto_txt = str(worker) + "\n" + str(server) - with open('proto_txt', 'w') as f: - f.write(proto_txt) - + proto_txt = worker_desc + "\n" + server_desc debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: print("worker: \n{}".format(proto_txt)) - endpoints = get_ps_endpoints(self.role_maker) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) - dense_map = get_the_one_recv_context( self.context, split_dense_table=self.is_heter_ps_mode) send_ctx = 
get_the_one_send_context( @@ -741,7 +876,7 @@ def sync_strategy_envs(): kwargs["trainer_id"] = self.role_maker._role_id() kwargs["trainers"] = self.role_maker._worker_num() - for table in server.servers[0].tables: + for table in server.servers[0].tables: #TODO if table.table_class == "BarrierTable": kwargs["barrier_table_id"] = table.id break @@ -755,7 +890,8 @@ def sync_strategy_envs(): trainer_config.mode, kwargs, trainer_config.get_communicator_flags()) self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt, - string_hosts, fluid.global_scope()) + self.string_hosts, + fluid.global_scope()) fleet.util.barrier() info = self._communicator.get_client_info() @@ -812,275 +948,16 @@ def sync_strategy_envs(): previous_trainers, self.role_maker._role_id()) - def _push_sparse_param(self, - var_name, - table_id=-1, - scope=fluid.global_scope()): - self._communicator.push_sparse_param(var_name, table_id, scope) - - def _get_executor(self): - executor = fluid.Executor(fluid.CPUPlace()) - if self.is_heter_ps_mode: - if self.role_maker._is_heter_worker(): - heter_device_type = self.role_maker._heter_device_type().upper() - if heter_device_type not in ["GPU", "XPU", "CPU"]: - raise ValueError("Heter Worker Not Support Device {}". - format(device_type)) - if heter_device_type == "GPU": - executor = Executor( - fluid.CUDAPlace( - int(os.getenv("FLAGS_selected_gpus", "0")))) - elif heter_device_type == "XPU": - executor = Executor( - fluid.XPUPlace( - int(os.getenv("FLAGS_selected_xpus", "0")))) - return executor - - def _get_fleet_proto(self, is_server, is_sync, **kwargs): - def _build_merge_accessor(ctx): - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - - if ctx.is_sparse(): - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = ctx.sections()[1] - else: - accessor.feature_dim = ctx.sections()[0] - accessor.embedding_dim = 1 - - return accessor - - def _build_barrier_table(idx): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = "BarrierTable" - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = "barrier_table" - trainer_num = get_trainers(self.context['role_maker']) - if self.is_heter_ps_mode: - trainer_num += len(self.role_maker._get_heter_worker_endpoints( - )) - common.trainer_num = trainer_num - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - return table - - def _build_tensor_table(idx, tensor_dict): - table = Table() - table.id = idx - table.type = "PS_OTHER_TABLE" - table.table_class = tensor_dict["tensor_table_class"] - table.shard_num = 256 - - accessor = Accessor() - accessor.accessor_class = "CommMergeAccessor" - accessor.optimizer = None - accessor.feature_dim = 0 - accessor.embedding_dim = 0 - table.accessor = accessor - - common = CommonAccessor() - common.table_name = tensor_dict["feed_var_name"] - common.trainer_num = get_trainers(self.role_maker) - common.attrs = "" - common.dims = [] - common.params = [] - table.common = common - - tensor = Tensor() - tensor.main_program_id = tensor_dict["main_program_id"] - tensor.startup_program_id = tensor_dict["startup_program_id"] - tensor.feed_var_name = tensor_dict["feed_var_name"] - tensor.fetch_var_name = tensor_dict["fetch_var_name"] - tensor.tensor_table_class = 
tensor_dict["tensor_table_class"] - table.tensor = tensor - - return table - - def _add_tensor_table(tables): - tensor_table_dict = {} - program_idx = 0 - for table_name in tensor_table_dict: - if tensor_table_dict[table_name]["startup_program"] != None: - tensor_table_dict[table_name][ - "startup_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["startup_program"].desc) - program_idx += 1 - if tensor_table_dict[table_name]["main_program"] != None: - tensor_table_dict[table_name][ - "main_program_id"] = program_idx - self._server_sub_program.append(tensor_table_dict[ - table_name]["main_program"].desc) - program_idx += 1 - # Todo: Hard code for lr_decay table apply table id - new_table = _build_tensor_table( - len(tables), tensor_table_dict[table_name]) - tables.append(new_table) - return tables - - def _get_tables(): - send_ctx = get_the_one_send_context( - self.context, - use_origin_program=True, - split_dense_table=self.is_heter_ps_mode) - - tables = [] - for idx, (name, ctx) in enumerate(send_ctx.items()): - print(" wxm python test send_ctx.items-->", idx, (name, ctx)) - if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1: - continue - - table = Table() - table.id = ctx.table_id() - common = CommonAccessor() - - if ctx.is_sparse(): - table.type = "PS_SPARSE_TABLE" - table.shard_num = 256 - - common.table_name = self.context['grad_name_to_param_name'][ - ctx.origin_varnames()[0]] - - if self.context['ps_mode'] == DistributedMode.GEO: - table.table_class = "SparseGeoTable" - else: - all_table_proto = self.context[ - "user_defined_strategy"].sparse_table_configs - table_proto = all_table_proto.add() - for proto in all_table_proto: - if proto.table_name == common.table_name: - table_proto = proto - break - if table_proto.HasField("table_class"): - table.table_class = table_proto.table_class - else: - table.table_class = parse_table_class( - common.table_name, - ctx.program_id(), self.context) - if table.table_class != 'MemorySparseTable': - table.table_class = 'MemorySparseTable' - warnings.warn( - "The PS mode must use MemorySparseTable.") - - if table_proto.HasField("shard_num"): - table.shard_num = table_proto.shard_num - else: - table.shard_num = 1000 - warnings.warn( - "The shard_num of sparse table is not set, use default value 1000." - ) - - if table_proto.accessor.ByteSize() == 0: - warnings.warn( - "The accessor of sparse table is not set, use default value." 
- ) - get_default_accessor_proto( - table_proto.accessor, common.table_name, - ctx.program_id(), self.context) - check_embedding_dim(table_proto.accessor, - common.table_name, - ctx.program_id(), self.context) - table.accessor_proto = text_format.MessageToString( - table_proto.accessor) - else: - table.type = "PS_DENSE_TABLE" - table.table_class = "CommonDenseTable" - table.shard_num = 256 - common.table_name = "MergedDense" - - adam_d2sum = self.context["user_defined_strategy"].adam_d2sum - common.parse_by_optimizer(ctx, self.context) - - if ctx.is_sparse(): - common.parse_entry(common.table_name, - ctx.program_id(), self.context) - - if is_sync: - common.sync = "true" - else: - common.sync = "false" - table.common = common - - if table.table_class != 'MemorySparseTable': - accessor = _build_merge_accessor(ctx) - table.accessor = accessor - tables.append(table) - - tensor_table_dict = {} - if len(tensor_table_dict) > 0: - tables = _add_tensor_table(tables) - else: - empty_porgram = Program() - self._server_sub_program.append(empty_porgram.desc) - - barrier_table = _build_barrier_table(len(tables)) - tables.append(barrier_table) - return tables - - if is_server: - server = Server() - downpour_server = DownpourServer() - - service = Service() - dist_strategy = self.context["valid_strategy"] - use_ps_gpu = dist_strategy.a_sync_configs["use_ps_gpu"] - if use_ps_gpu: - service.server_class = "PsLocalServer" - service.client_class = "PsLocalClient" - downpour_server.set_service_param(service) - - tables = _get_tables() - downpour_server.tables = tables - server.add_server(downpour_server) - return server - else: - worker = Worker() - downpour_worker = DownpourWorker() - - tables = _get_tables() - downpour_worker.tables = tables - worker.add_worker(downpour_worker) - return worker - def _init_server(self, dirname=None, var_names=None, **kwargs): + server_desc = self.ps_desc_builder.build_server_desc() role_id = get_role_id(self.role_maker) - endpoints = get_ps_endpoints(self.role_maker) trainers = get_trainers(self.role_maker) if self.is_heter_ps_mode: trainers += len(self.role_maker._get_heter_worker_endpoints()) - server = self._get_fleet_proto(is_server=True, is_sync=self.is_sync) - proto_txt = str(server) - fs_client = fsClient(self.context["user_defined_strategy"] - .fs_client_param) - proto_txt = proto_txt + "\n" + fs_client.to_string() - - debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) - if debug: - print("server: \n{}".format(proto_txt)) - - string_hosts = [] - for idx, ep in enumerate(endpoints): - host, port = ep.split(":") - pshost = fluid.core.PSHost(host, int(port), idx) - string_hosts.append(pshost.serialize_to_string()) self._server = fluid.core.DistFleetWrapper() - self._server.init_server(proto_txt, string_hosts, role_id, trainers, - self._server_sub_program) + self._server.init_server(server_desc, self.string_hosts, role_id, + trainers, self._server_sub_program) dist_varnames = get_sparse_tablenames(self.origin_main_programs, True) sparse_varnames = get_sparse_tablenames(self.origin_main_programs, @@ -1101,10 +978,7 @@ def _init_server(self, dirname=None, var_names=None, **kwargs): if dirname is None or not load_varnames: return - sparse_table_maps = {} - for table in server.servers[0].tables: - if table.type == "PS_SPARSE_TABLE" and table.common is not None: - sparse_table_maps[table.common.table_name] = table.id + sparse_table_maps = self.ps_desc_builder.sparse_table_maps dirname = os.path.normpath(dirname) pserver_id = self.role_maker._role_id() @@ -1186,7 +1060,7 @@ def 
_save_distributed_persistables(self, sparses = get_the_one_recv_context( self.context, is_dense=False, - split_dense_table=self.is_heter_ps_mod, + split_dense_table=self.is_heter_ps_mode, use_origin_program=True) sparse_varnames = self._save_sparse_params(executor, dirname, sparses, @@ -1413,7 +1287,7 @@ def _shrink(self, threshold=None): fleet.util.barrier() if self.role_maker._is_first_worker(): - sparses = sget_the_one_recv_context( + sparses = get_the_one_recv_context( self.context, is_dense=False, split_dense_table=self.role_maker. diff --git a/python/paddle/distributed/ps/utils/ps_factory.py b/python/paddle/distributed/ps/utils/ps_factory.py index 1a426f3ad6c6a..701ae8be6cb9c 100755 --- a/python/paddle/distributed/ps/utils/ps_factory.py +++ b/python/paddle/distributed/ps/utils/ps_factory.py @@ -38,5 +38,7 @@ def _create_ps_program_builder(self, pass_ctx): elif 'is_fl_ps_mode' in attrs and attrs[ 'is_fl_ps_mode'] == DistributedMode.FL: return globals()['FlPsProgramBuilder'](pass_ctx) - else: + elif attrs['ps_mode'] == DistributedMode.SYNC: return globals()['CpuSyncPsProgramBuilder'](pass_ctx) + else: + return globals()['CpuAsyncPsProgramBuilder'](pass_ctx) diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index 25e4dc28bdcb8..d737542f32344 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -95,11 +95,12 @@ def _build_trainer_programs(self): class CpuSyncPsProgramBuilder(PsProgramBuilder): def __init__(self, pass_ctx): - logger.info("start building cpu-sync-ps program") super(CpuSyncPsProgramBuilder, self).__init__(pass_ctx) + if self.ps_mode == DistributedMode.SYNC: + logger.info("start building cpu-sync-ps program") if self.ps_mode != DistributedMode.SYNC and self.ps_mode != DistributedMode.ASYNC: raise ValueError("ps mode: {} not matched {}", - format(self.ps_mode, "CpuSyncPsProgramBuilder")) + format(self.ps_mode, "PsProgramBuilder")) def _build_trainer_programs(self): add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass", diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index ebec6900e38f5..ab5bd7da09dfc 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -73,7 +73,9 @@ def logger_config(log_path, logging_name): return logger -logger = logger_config(log_path='/ps_log', logging_name='ps_log') +ps_log_root_dir = '/ps_log/' +logger = logger_config( + log_path='/ps_usr_print_log', logging_name='ps_usr_print_log') class DistributedMode: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt old mode 100644 new mode 100755 index 2f6df075478e6..1443eebf29384 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -627,7 +627,7 @@ set_tests_properties(test_norm_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_nn_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) add_subdirectory(distributed_passes) - + add_subdirectory(ps) add_subdirectory(auto_parallel) # FIXME(typhoonzero): add these tests back diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py index 63dd4b8e21e07..93a0044a5e43c 100755 --- 
a/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/ps_pass_test_base.py @@ -23,13 +23,24 @@ import numpy as np from collections import OrderedDict from paddle.distributed.ps.utils.public import logger -from dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists +from paddle.fluid.tests.unittests.distributed_passes.dist_pass_test_base import prepare_python_path_and_return_module, remove_path_if_exists import paddle.distributed.fleet as fleet class PsPassTestBase(unittest.TestCase): def init(self): - raise NotImplementedError + self.config = {} + self.config['ps_mode_config'] = "" + self.config['worker_num'] = "1" + self.config['server_num'] = "1" + self.config['run_minimize'] = "0" + self.config['run_single_pass'] = "0" + self.config['run_the_one_ps'] = '0' + self.config['debug_new_minimize'] = "0" + self.config['debug_new_pass'] = "0" + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = "" + self.config['applied_pass_name'] = "" def setUp(self): print('Ps setUp...') @@ -37,7 +48,7 @@ def setUp(self): def tearDown(self): print('Ps tearDown...') - def ps_launch(self, config, ps_mode="cpu-ps"): + def ps_launch(self, ps_mode="cpu-ps"): if ps_mode == "cpu-ps" or ps_mode == 'heter-ps': os.environ['WITH_DISTRIBUTE'] = 'ON' @@ -45,23 +56,26 @@ def ps_launch(self, config, ps_mode="cpu-ps"): sys.executable, "-u", ] + [ - "-m", "launch", "--log_dir", config['log_dir'], "--worker_num", - config['worker_num'], "--server_num", config['server_num'] + "-m", "launch", "--log_dir", self.config['log_dir'], + "--worker_num", self.config['worker_num'], "--server_num", + self.config['server_num'] ] if ps_mode == 'heter-ps': os.environ['FLAGS_START_PORT'] = '12004' cmd += [ - '--heter_worker_num', config['heter_worker_num'], - '--heter_devices', config['heter_devices'] + '--heter_worker_num', self.config['heter_worker_num'], + '--heter_devices', self.config['heter_devices'] ] cmd += [ - "../ps/ps_dnn_trainer.py", "-m", config['ps_mode_config'], - "--run_minimize", config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + "../ps/ps_dnn_trainer.py", "-m", self.config['ps_mode_config'], + "--run_minimize", self.config['run_minimize'], + "--run_single_pass", self.config['run_single_pass'], + "--run_the_one_ps", self.config['run_the_one_ps'], + "--debug_new_pass", self.config['debug_new_pass'], + "--debug_new_minimize", self.config['debug_new_minimize'], + "--applied_pass_name", self.config['applied_pass_name'], + "--debug_the_one_ps", self.config['debug_the_one_ps'] ] elif ps_mode == "gpu-ps": os.environ['FLAGS_LAUNCH_BARRIER'] = '0' @@ -80,12 +94,14 @@ def ps_launch(self, config, ps_mode="cpu-ps"): cmd = [ sys.executable, "-u", "../ps/ps_dnn_trainer.py", "-m", - config['ps_mode_config'], "--run_minimize", - config['run_minimize'], "--run_single_pass", - config['run_single_pass'], "--debug_new_pass", - config['debug_new_pass'], "--debug_new_minimize", - config['debug_new_minimize'], "--applied_pass_name", - config['applied_pass_name'] + self.config['ps_mode_config'], "--run_minimize", + self.config['run_minimize'], "--run_single_pass", + self.config['run_single_pass'], "--run_the_one_ps", + self.config['run_the_one_ps'], "--debug_new_pass", + self.config['debug_new_pass'], "--debug_new_minimize", + 
self.config['debug_new_minimize'], "--applied_pass_name", + self.config['applied_pass_name'], "--debug_the_one_ps", + self.config['debug_the_one_ps'] ] cmd = [shlex.quote(c) for c in cmd] diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py index b186869ee9747..fd558ef040329 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py @@ -21,31 +21,26 @@ import paddle from ps_pass_test_base import * -from paddle.distributed.ps.utils.public import logger +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir from paddle.fluid.tests.unittests.ps.ps_dnn_trainer import DnnTrainer class TestPsTrainerPass(PsPassTestBase): - def init(self): - self.config = {} - self.config['ps_mode_config'] = "" - self.config['worker_num'] = "1" - self.config['server_num'] = "1" - self.config['run_minimize'] = "0" - self.config['run_single_pass'] = "0" - self.config['debug_new_minimize'] = "0" - self.config['debug_new_pass'] = "0" - self.config['log_dir'] = "" - self.config['applied_pass_name'] = "" - def setUp(self): pass def tearDown(self): pass - def check(self): - pass + def check(self, file1, file2): + with open(file1, 'r', encoding='utf-8') as f: + text1 = f.read() + with open(file2, 'r', encoding='utf-8') as f: + text2 = f.read() + if text1 == text2: + return True + else: + return False def test_ps_optimizer_minimize_cpu_async(self): self.init() @@ -53,16 +48,21 @@ def test_ps_optimizer_minimize_cpu_async(self): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/async_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/async_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "async_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_async passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_async failed!') def test_ps_optimizer_minimize_cpu_sync(self): self.init() @@ -70,16 +70,22 @@ def test_ps_optimizer_minimize_cpu_sync(self): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/sync_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/sync_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "sync_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) - - self.check() + self.ps_launch() + ''' + file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_sync passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_sync 
failed!') + ''' def test_ps_optimizer_minimize_cpu_geo(self): self.init() @@ -87,16 +93,21 @@ def test_ps_optimizer_minimize_cpu_geo(self): self.config['run_minimize'] = '1' self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/geo_cpu_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/geo_cpu_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config) + self.ps_launch() - self.check() + file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_cpu_geo passed!') + else: + logger.error('test_ps_optimizer_minimize_cpu_geo failed!') # heter ps 二阶段 def test_ps_optimizer_minimize_heter(self): @@ -110,14 +121,24 @@ def test_ps_optimizer_minimize_heter(self): self.config['ps_mode_config'] = "../ps/heter_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.config['log_dir'] = "/heter_log_old_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_old_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') self.config['debug_new_minimize'] = '1' - self.config['log_dir'] = "/heter_log_new_minimize" + self.config['log_dir'] = ps_log_root_dir + "heter_log_new_minimize" remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, 'heter-ps') + self.ps_launch('heter-ps') + ''' + file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt' + file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt' + file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt' + if self.check(file1, file2) and self.check(file3, file4): + logger.info('test_ps_optimizer_minimize_heter passed!') + else: + logger.error('test_ps_optimizer_minimize_heter failed!') + ''' def test_ps_optimizer_minimize_gpu(self): self.init() @@ -125,29 +146,42 @@ def test_ps_optimizer_minimize_gpu(self): self.config['ps_mode_config'] = "../ps/gpu_ps_config.yaml" self.config['debug_new_minimize'] = '0' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") self.config['debug_new_minimize'] = '1' - self.ps_launch(self.config, "gpu-ps") + self.ps_launch("gpu-ps") - self.check() + file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt' + file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_ps_optimizer_minimize_gpu passed!') + else: + logger.error('test_ps_optimizer_minimize_gpu failed!') def test_append_send_ops_pass(self): self.init() self.config['run_single_pass'] = '1' + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" self.config['applied_pass_name'] = "append_send_ops_pass" self.config['debug_new_pass'] = '0' - self.config['log_dir'] = "/log_old_" + self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_old_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") + self.ps_launch("cpu-ps") self.config['debug_new_pass'] = '1' - self.config['log_dir'] = "/log_new_" + 
self.config['applied_pass_name'] + self.config['log_dir'] = ps_log_root_dir + "log_new_" + self.config[ + 'applied_pass_name'] remove_path_if_exists(self.config['log_dir']) - self.ps_launch(self.config, "cpu-ps") - - self.check() + self.ps_launch("cpu-ps") + + file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt' + file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt' + if self.check(file1, file2): + logger.info('test_append_send_ops_pass passed!') + else: + logger.info('test_append_send_ops_pass failed!') def test_distributed_ops_pass(self): pass diff --git a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt old mode 100644 new mode 100755 index 3aef3283b8200..9af32a8aca741 --- a/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ps/CMakeLists.txt @@ -3,6 +3,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + list(APPEND TEST_OPS ${TEST_OP}) + set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50) endforeach(TEST_OP) - -set_tests_properties(test_the_one_ps PROPERTIES TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py index d08c1d41c89ec..bc87fc255a59b 100755 --- a/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py +++ b/python/paddle/fluid/tests/unittests/ps/ps_dnn_trainer.py @@ -264,12 +264,16 @@ def parse_args(): '--run_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--run_single_pass', type=int, default=0, help="test single pass") + parser.add_argument( + '--run_the_one_ps', type=int, default=0, help="test the_one_ps") parser.add_argument( '--debug_new_minimize', type=int, default=0, help="test single pass") parser.add_argument( '--debug_new_pass', type=int, default=0, help="test single pass") parser.add_argument( '--applied_pass_name', type=str, default="", help="test single pass") + parser.add_argument( + '--debug_the_one_ps', type=int, default=0, help="test the_one_ps") args = parser.parse_args() args.abs_dir = os.path.dirname(os.path.abspath(args.config_yaml)) @@ -280,9 +284,11 @@ def parse_args(): config["pure_bf16"] = args.pure_bf16 config['run_minimize'] = args.run_minimize config['run_single_pass'] = args.run_single_pass + config['run_the_one_ps'] = args.run_the_one_ps config['debug_new_minimize'] = args.debug_new_minimize config['debug_new_pass'] = args.debug_new_pass config['applied_pass_name'] = args.applied_pass_name + config['debug_the_one_ps'] = args.debug_the_one_ps yaml_helper.print_yaml(config) return config @@ -344,15 +350,15 @@ def run_minimize(self): fleet_obj.minimize(loss) if fleet.is_server(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_server_main.prototxt' debug_program(_main_file, loss.block.program) elif fleet.is_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( self.config['debug_new_minimize']) + '_worker_main.prototxt' debug_program(_main_file, loss.block.program) elif self.role_maker._is_heter_worker(): - _main_file = '/' + sync_mode + '_run_minimize' + '_debug:_' + str( + _main_file = ps_log_root_dir + sync_mode + '_run_minimize' + '_debug:_' + str( 
self.config[ 'debug_new_minimize']) + '_heter_worker_main.prototxt' debug_program(_main_file, loss.block.program) @@ -397,16 +403,84 @@ def run_single_pass(self): _main = worker.append_send_ops_pass(_main, compiled_config) if fleet.is_server(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_server_main.prototxt' debug_program(_main_file, _main) elif fleet.is_worker(): - _main_file = '/' + sync_mode + "_" + str(config[ + _main_file = ps_log_root_dir + sync_mode + "_" + str(config[ "applied_pass_name"]) + '_debug:_' + str(self.config[ 'debug_new_pass']) + '_worker_main.prototxt' debug_program(_main_file, _main) + def run_the_one_ps(self): + self.init_fleet_with_gloo() + self.model = get_model(self.config) + self.input_data = self.model.create_feeds() + self.metrics = self.model.net(self.input_data) + loss = self.model._cost + user_defined_strategy = get_user_defined_strategy(self.config) + learning_rate = self.config.get( + "hyper_parameters.optimizer.learning_rate") + sync_mode = self.config.get("runner.sync_mode") + inner_optimizer = paddle.optimizer.Adam(learning_rate, lazy_mode=True) + + self.role_maker._generate_role() # 必要 + if self.config['debug_the_one_ps'] == 1: + logger.info("entering run_the_one_ps -- new") + + from paddle.distributed.fleet.meta_optimizers.ps_optimizer import ParameterServerOptimizer + ps_optimizer = ParameterServerOptimizer(inner_optimizer) + ps_optimizer._set_basic_info(loss, self.role_maker, inner_optimizer, + user_defined_strategy) + ps_optimizer.minimize_impl(loss) + + from paddle.distributed.ps.the_one_ps import TheOnePSRuntime + _runtime_handle = TheOnePSRuntime() # ps 目录下重构版的 TheOnePSRuntime + _runtime_handle._set_basic_info(ps_optimizer.pass_ctx._attrs) + if fleet.is_worker(): + worker_desc = _runtime_handle.ps_desc_builder.build_worker_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_worker_ps_desc', 'w') as f: + f.write(worker_desc) + if fleet.is_server(): + server_desc = _runtime_handle.ps_desc_builder.build_server_desc( + ) + with open(ps_log_root_dir + sync_mode + '_' + + 'new_server_ps_desc', 'w') as f: + f.write(server_desc) + + else: + pass + ''' + logger.info("entering run_the_one_ps -- old") + fleet_obj = fleet.distributed_optimizer( + inner_optimizer, user_defined_strategy) + fleet_obj.minimize(loss) + if fleet.is_worker(): + worker_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=False, is_sync=False) + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'worker_ps_desc', 'w') as f: + f.write(str(worker_desc) + str(server_desc)) + if fleet.is_server(): + server_desc = fleet_obj._runtime_handle._get_fleet_proto(is_server=True, is_sync=False) + with open(ps_log_root_dir + sync_mode + '_' + 'server_ps_desc', 'w') as f: + f.write(str(server_desc) + str(fleet_obj._runtime_handle._get_fs_client_desc().to_string())) + ''' + if fleet.is_server(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_server_main.prototxt' + debug_program(_main_file, loss.block.program) + elif fleet.is_worker(): + _main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + elif self.role_maker._is_heter_worker(): + 
_main_file = ps_log_root_dir + sync_mode + '_run_the_one_ps' + '_debug:_' + str( + self.config['debug_the_one_ps']) + '_heter_worker_main.prototxt' + debug_program(_main_file, loss.block.program) + if __name__ == "__main__": paddle.enable_static() @@ -418,3 +492,5 @@ def run_single_pass(self): benchmark_main.run_single_pass() elif config['run_minimize'] == 1: benchmark_main.run_minimize() + elif config['run_the_one_ps'] == 1: + benchmark_main.run_the_one_ps() diff --git a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py old mode 100644 new mode 100755 index 78bae0e50c580..8dddc6abd4ced --- a/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py +++ b/python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py @@ -22,16 +22,100 @@ import paddle import paddle.fluid as fluid +import paddle +from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import * +from paddle.distributed.ps.utils.public import logger, ps_log_root_dir +from ps_dnn_trainer import DnnTrainer +from paddle.distributed.fleet.proto import ps_pb2 +from google.protobuf import text_format + -class TestTheOnePs(unittest.TestCase): +class TestTheOnePs(PsPassTestBase): def setUp(self): - print('setUp...') + pass def tearDown(self): - print('tearDown...') + pass - def test_main(self): + def check(self, file1, file2): pass + ''' + f = open(file1, "rb") + ps_desc_1 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_1) + f.close() + + f = open(file2, "rb") + ps_desc_2 = ps_pb2.PSParameter() + text_format.Parse(f.read(), ps_desc_2) + f.close() + str1 = text_format.MessageToString(ps_desc_1) + str2 = text_format.MessageToString(ps_desc_2) + #logger.info('### msg10: {}'.format(str1)) + #logger.info('### msg20: {}'.format(str2)) + if str1 == str2: + return True + else: + return False + ''' + + def test_ps_cpu_async(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_async_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config[ + 'log_dir'] = ps_log_root_dir + "async_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/async_worker_ps_desc' + desc2 = '/ps_log/async_new_worker_ps_desc' + desc3 = '/ps_desc_baseline/async_server_ps_desc' + desc4 = '/ps_log/async_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_async ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_async ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_async ps_desc: server passed!') + else: + logger.info('test_ps_cpu_async ps_desc: server failed!') + + def test_ps_cpu_geo(self): + self.init() + self.config['ps_mode_config'] = "../ps/cpu_geo_ps_config.yaml" + self.config['run_the_one_ps'] = '1' + + self.config['debug_the_one_ps'] = '0' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_old_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + self.config['debug_the_one_ps'] = '1' + self.config['log_dir'] = ps_log_root_dir + "geo_cpu_log_new_the_one_ps" + remove_path_if_exists(self.config['log_dir']) + self.ps_launch() + + desc1 = '/ps_desc_baseline/geo_worker_ps_desc' + desc2 = '/ps_log/geo_new_worker_ps_desc' + desc3 = 
'/ps_desc_baseline/geo_server_ps_desc' + desc4 = '/ps_log/geo_new_server_ps_desc' + if self.check(desc1, desc2): + logger.info('test_ps_cpu_geo ps_desc: worker passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: worker failed!') + if self.check(desc3, desc4): + logger.info('test_ps_cpu_geo ps_desc: server passed!') + else: + logger.info('test_ps_cpu_geo ps_desc: server failed!') if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ps_dnn_model.py b/python/paddle/fluid/tests/unittests/ps_dnn_model.py index 0a147334dab26..8d91e0f4678cb 100755 --- a/python/paddle/fluid/tests/unittests/ps_dnn_model.py +++ b/python/paddle/fluid/tests/unittests/ps_dnn_model.py @@ -74,6 +74,7 @@ def forward(self, sparse_inputs, dense_inputs): else: emb = self.embedding(s_input) emb = paddle.reshape(emb, shape=[-1, self.sparse_feature_dim]) + # emb.stop_gradient = True sparse_embs.append(emb) y_dnn = paddle.concat(x=sparse_embs + [dense_inputs], axis=1) From 28795771408a6dcd757ed367d348fb0ead5ab507 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 2 Mar 2022 16:40:05 +0800 Subject: [PATCH 16/41] run recompute's real backward with amp disabled (#40042) --- python/paddle/distributed/fleet/utils/recompute.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index dccd7f6205302..4ccb48ef72e71 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -182,9 +182,10 @@ def backward(ctx, *args): "none of output has requires_grad=True, this recompute() is not necessary" ) - # actually backward - paddle.autograd.backward(forward_outputs_with_grad, - backward_inputs_with_grad) + # actually backward + with paddle.amp.auto_cast(enable=False): + paddle.autograd.backward(forward_outputs_with_grad, + backward_inputs_with_grad) grads = list(inp._grad_ivar() for inp in detached_inputs if isinstance(inp, core.VarBase)) From 8492d3bbf6f01e98d6674b57b27913fe537584dd Mon Sep 17 00:00:00 2001 From: zhangkaihuo Date: Wed, 2 Mar 2022 16:43:52 +0800 Subject: [PATCH 17/41] The backward code of Sparse Conv3d (#40054) Sparse Conv3d backward code --- .../kernels/sparse/convolution_grad_kernel.h | 66 +++++++ paddle/phi/kernels/sparse/cpu/convolution.h | 1 + .../sparse/cpu/convolution_grad_kernel.cc | 166 ++++++++++++++++++ .../kernels/test_sparse_conv3d_dev_api.cc | 112 +++++++++++- 4 files changed, 337 insertions(+), 8 deletions(-) create mode 100644 paddle/phi/kernels/sparse/convolution_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc diff --git a/paddle/phi/kernels/sparse/convolution_grad_kernel.h b/paddle/phi/kernels/sparse/convolution_grad_kernel.h new file mode 100644 index 0000000000000..1a6ac852448a5 --- /dev/null +++ b/paddle/phi/kernels/sparse/convolution_grad_kernel.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad); + +template +std::vector Conv3dGrad(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups) { + DenseTensor x_grad = phi::Empty(dev_ctx); + DenseTensor kernel_grad = phi::Empty(dev_ctx); + Conv3dGradKernel(dev_ctx, + x, + rulebook, + kernel, + out_grad, + paddings, + dilations, + strides, + groups, + &x_grad, + &kernel_grad); + std::vector out(2); + out[0] = x_grad; + out[1] = kernel_grad; + return out; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/convolution.h b/paddle/phi/kernels/sparse/cpu/convolution.h index 5803069d927d7..ab2fef5320f71 100644 --- a/paddle/phi/kernels/sparse/cpu/convolution.h +++ b/paddle/phi/kernels/sparse/cpu/convolution.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/convolution_kernel.h" namespace phi { namespace sparse { diff --git a/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc new file mode 100644 index 0000000000000..d4f770ce8713a --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
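The CPU kernel defined next in this patch follows the gather-GEMM-scatter pattern described in its comments: each rulebook column pairs one input non-zero with one output non-zero under a given kernel offset, the pairs are grouped by kernel offset, d_kernel = transpose(x) * out_grad and d_x = out_grad * transpose(kernel) are computed per group, and the d_x rows are scattered back by input index. The following pure-Python sketch is only illustrative; the helper names and the toy data are not taken from the patch.

def matmul(A, B):
    return [[sum(a * b for a, b in zip(row, col)) for col in zip(*B)] for row in A]

def transpose(A):
    return [list(col) for col in zip(*A)]

def conv3d_grad(x_values, dout_values, kernel, rulebook):
    # rulebook rows: [kernel_index, in_i, out_i], one column per (input, output) pair
    k_idx, in_i, out_i = rulebook
    x_grad = [[0.0] * len(x_values[0]) for _ in x_values]
    kernel_grad = [[[0.0] * len(kernel[0][0]) for _ in kernel[0]] for _ in kernel]
    for k in range(len(kernel)):
        rows = [r for r, ki in enumerate(k_idx) if ki == k]
        if not rows:
            continue
        X = [x_values[in_i[r]] for r in rows]         # gathered inputs,    (M, in_channels)
        dOut = [dout_values[out_i[r]] for r in rows]  # gathered out grads, (M, out_channels)
        kernel_grad[k] = matmul(transpose(X), dOut)   # d_kernel = transpose(x) * out_grad
        dX = matmul(dOut, transpose(kernel[k]))       # d_x = out_grad * transpose(kernel)
        for r, row in zip(rows, dX):                  # scatter-add back into x_grad
            for c, v in enumerate(row):
                x_grad[in_i[r]][c] += v
    return x_grad, kernel_grad

# toy check: 2 non-zeros, 2 kernel offsets, 1 channel
print(conv3d_grad([[1.0], [2.0]], [[0.5], [0.25]],
                  [[[3.0]], [[4.0]]], [[0, 1], [0, 1], [0, 1]]))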
*/ + +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/sparse/cpu/convolution.h" + +namespace phi { +namespace sparse { + +// rulebook: +//[ +// [kernel_index], +// [in_i], +// [out_i], +//] +// x_grad = out_grad * transpose(kenrel) +// kernel_grad = transpose(x) * out_grad +template +void Conv3dGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& rulebook, + const DenseTensor& kernel, + const SparseCooTensor& out_grad, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + DenseTensor* x_grad, + DenseTensor* kernel_grad) { + const auto& kernel_dims = kernel.dims(); + const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; + const int in_channels = kernel_dims[3]; + const int out_channels = kernel_dims[4]; + const int* rulebook_ptr = rulebook.data(); + + const int rulebook_len = rulebook.dims()[1]; + + DenseTensorMeta in_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta d_x_features_meta( + x.dtype(), {rulebook_len, in_channels}, DataLayout::NCHW); + DenseTensorMeta out_grad_features_meta( + x.dtype(), {rulebook_len, out_channels}, DataLayout::NCHW); + phi::DenseTensor in_features = + phi::Empty(dev_ctx, std::move(in_features_meta)); + phi::DenseTensor d_x_features = + phi::Empty(dev_ctx, std::move(d_x_features_meta)); + phi::DenseTensor out_grad_features = + phi::Empty(dev_ctx, std::move(out_grad_features_meta)); + + dev_ctx.Alloc( + &in_features, in_features.dtype(), sizeof(T) * in_features.numel()); + T* in_features_ptr = in_features.data(); + dev_ctx.Alloc( + &d_x_features, d_x_features.dtype(), sizeof(T) * d_x_features.numel()); + T* d_x_features_ptr = d_x_features.data(); + dev_ctx.Alloc(&out_grad_features, + out_grad_features.dtype(), + sizeof(T) * out_grad_features.numel()); + T* out_grad_features_ptr = out_grad_features.data(); + kernel_grad->Resize(kernel_dims); + dev_ctx.Alloc( + kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T)); + T* d_kernel_ptr = kernel_grad->data(); + + Gather(x.non_zero_elements().data(), + rulebook_ptr + rulebook_len, + rulebook_len, + in_channels, + in_features_ptr); + Gather(out_grad.non_zero_elements().data(), + rulebook_ptr + rulebook_len * 2, + rulebook_len, + out_channels, + out_grad_features_ptr); + + auto blas = phi::funcs::GetBlas(dev_ctx); + std::vector offsets(kernel_size + 1), counter(kernel_size, 0); + for (int i = 0; i < rulebook_len; i++) { + counter[rulebook_ptr[i]] += 1; + } + int offset = 0; + for (int i = 0; i < kernel_size; i++) { + offsets[i] = offset; + offset += counter[i]; + } + offsets[kernel_size] = offset; + + const T* kernel_ptr = kernel.data(); + for (int i = 0; i < kernel_size; i++) { + if (counter[i] <= 0) { + continue; + } + + const int M = counter[i]; + const int K = in_channels; + const int N = out_channels; + T* tmp_in_ptr = in_features_ptr + offsets[i] * in_channels; + T* tmp_out_grad_ptr = out_grad_features_ptr + offsets[i] * out_channels; + const T* tmp_kernel_ptr = kernel_ptr + i * in_channels * out_channels; + T* tmp_d_x_ptr = d_x_features_ptr + offsets[i] * out_channels; + T* tmp_d_kernel_ptr = d_kernel_ptr + i * in_channels * out_channels; + + // call gemm: d_kernel = transpose(x) * out_grad + // (in_channels, n) * (n, out_channels) + blas.GEMM(CblasTrans, + CblasNoTrans, + M, + N, + K, + static_cast(1), + tmp_in_ptr, + tmp_out_grad_ptr, + static_cast(0), 
+ tmp_d_kernel_ptr); + + // call gemm: d_x = out_grad * transpose(kernel) + // (n, out_channels) * (out_channels, in_channels) + blas.GEMM(CblasNoTrans, + CblasTrans, + M, + K, + N, + static_cast(1), + tmp_out_grad_ptr, + tmp_kernel_ptr, + static_cast(0), + tmp_d_x_ptr); + } + + // 4. scatter + x_grad->Resize(x.non_zero_elements().dims()); + dev_ctx.Alloc(x_grad, x_grad->dtype(), sizeof(T) * x_grad->numel()); + T* x_grad_values_ptr = x_grad->data(); + memset(x_grad_values_ptr, 0, sizeof(T) * x_grad->numel()); + Scatter(d_x_features_ptr, + rulebook.data() + rulebook_len, + rulebook_len, + in_channels, + x_grad_values_ptr); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sparse_conv_grad, + CPU, + ALL_LAYOUT, + phi::sparse::Conv3dGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(3).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index 576015143704b..00b2a256a9504 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -59,7 +60,10 @@ void TestConv3dBase(const std::vector& indices, const std::vector& paddings, const std::vector& strides, const std::vector& dilations, - const float diff = 1e-3) { + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { phi::CPUContext dev_ctx_cpu; dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() @@ -122,10 +126,29 @@ void TestConv3dBase(const std::vector& indices, correct_out_indices.size() * sizeof(int)); ASSERT_EQ(cmp_indices, 0); - for (uint64_t i = 0; i < correct_out_features.size(); i++) { - float tmp = std::fabs(static_cast( - correct_out_features[i] - out.non_zero_elements().data()[i])); - ASSERT_LT(tmp, diff); + auto f_verify = [&](const T* real_data, + const std::vector& correct_data) { + for (uint64_t i = 0; i < correct_data.size(); i++) { + float tmp = + std::fabs(static_cast(correct_data[i] - real_data[i])); + ASSERT_LT(tmp, diff); + } + }; + + f_verify(out.non_zero_elements().data(), correct_out_features); + + if (backward) { + std::vector grads = sparse::Conv3dGrad(dev_ctx_cpu, + x_tensor, + rulebook, + kernel_tensor, + out, + paddings, + dilations, + strides, + 1); + f_verify(grads[0].data(), features_grad); + f_verify(grads[1].data(), kernel_grad); } } } @@ -141,7 +164,11 @@ void TestConv3d(const std::vector& indices, const int non_zero_num, const std::vector& paddings, const std::vector& strides, - const std::vector& dilations) { + const std::vector& dilations, + const float diff = 1e-3, + const bool backward = false, + const std::vector features_grad = {}, + const std::vector kernel_grad = {}) { // test float TestConv3dBase(indices, features, @@ -154,7 +181,11 @@ void TestConv3d(const std::vector& indices, non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + features_grad, + kernel_grad); // test double TestConv3dBase(indices, cast(features), @@ -167,7 +198,11 @@ void TestConv3d(const std::vector& indices, 
non_zero_num, paddings, strides, - dilations); + dilations, + diff, + backward, + cast(features_grad), + cast(kernel_grad)); } TEST(DEV_API, sparse_conv3d) { @@ -467,5 +502,66 @@ TEST(DEV_API, sparse_conv2d) { dilations); } +TEST(DEV_API, sparse_conv3d_backward) { + const int in_channels = 1; + const int out_channels = 1; + DDim x_dims = {1, 4, 4, 4, in_channels}; + DDim kernel_dims = {3, 3, 3, in_channels, out_channels}; + DDim out_dims = {1, 2, 2, 2, out_channels}; + std::vector paddings = {0, 0, 0}; + std::vector strides = {1, 1, 1}; + std::vector dilations = {1, 1, 1}; + + const int non_zero_num = 2; + std::vector indices_flatten = {0, 0, 0, 2, 3, 2, 3, 2}; + + std::vector features = {-0.28833008, 0.0287323}; + // 3*3*3=27 + std::vector kernel = { + 0.64306641, 0.45043945, 0.47216797, 0.22924805, 0.97509766, 0.86181641, + 0.57861328, 0.91796875, 0.87255859, 0.16589355, 0.44555664, 0.01889038, + 0.46459961, 0.44726562, 0.19909668, 0.89697266, 0.37158203, 0.00513077, + 0.69628906, 0.26904297, 0.74707031, 0.54003906, 0.5390625, 0.07958984, + 0.47338867, 0.90966797, 0.17126465}; + + std::vector out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, + 1, 1, 0, 1, 0, 1, 0, 1, 0, 1}; + + std::vector out_features = {4.9200e-03, + 2.6140e-02, + 2.2900e-03, + -2.3596e-01, + 1.5000e-04, + 1.0670e-02, + 5.7200e-03, + 1.2850e-02}; + + std::vector features_grad = {-0.20593, -0.09149}; + std::vector kernel_grad = { + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 0.000e+00, 6.805e-02, 0.000e+00, 0.000e+00, 0.000e+00, + 0.000e+00, 3.700e-04, 1.600e-04, 0.000e+00, 3.100e-04, 0.000e+00, + 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, -6.780e-03, 7.000e-05, + 0.000e+00, 7.500e-04, 1.400e-04}; + + TestConv3d(indices_flatten, + features, + x_dims, + kernel, + kernel_dims, + out_indices_flatten, + out_features, + out_dims, + non_zero_num, + paddings, + strides, + dilations, + 1e-3, + true, + features_grad, + kernel_grad); +} + } // namespace tests } // namespace phi From 2a5590a18e3dd90f815f20a82f6dcc722bc17892 Mon Sep 17 00:00:00 2001 From: From00 Date: Wed, 2 Mar 2022 16:55:19 +0800 Subject: [PATCH 18/41] Move BroadcastTensors OP to phi (#40047) * Move BroadcastTensors OP to phi * Remove mutable_data in impl * Move BilinearTensorProductInferMeta to multiary.h/cc --- .../fluid/operators/broadcast_tensors_op.cc | 99 +----- .../fluid/operators/broadcast_tensors_op.cu | 122 -------- paddle/fluid/operators/broadcast_tensors_op.h | 282 ------------------ paddle/phi/infermeta/multiary.cc | 66 +++- paddle/phi/infermeta/multiary.h | 5 + .../kernels/broadcast_tensors_grad_kernel.h | 27 ++ paddle/phi/kernels/broadcast_tensors_kernel.h | 27 ++ paddle/phi/kernels/complex_grad_kernel.h | 2 +- paddle/phi/kernels/complex_kernel.h | 14 +- .../cpu/broadcast_tensors_grad_kernel.cc | 201 +++++++++++++ .../kernels/cpu/broadcast_tensors_kernel.cc | 30 ++ .../gpu/broadcast_tensors_grad_kernel.cu | 111 +++++++ .../kernels/gpu/broadcast_tensors_kernel.cu | 30 ++ .../impl/broadcast_tensors_kernel_impl.h | 118 ++++++++ .../phi/ops/compat/broadcast_tensors_sig.cc | 28 ++ 15 files changed, 658 insertions(+), 504 deletions(-) delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.cu delete mode 100644 paddle/fluid/operators/broadcast_tensors_op.h create mode 100644 paddle/phi/kernels/broadcast_tensors_grad_kernel.h create mode 100644 paddle/phi/kernels/broadcast_tensors_kernel.h create mode 100644 paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc create 
mode 100644 paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu create mode 100644 paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h create mode 100644 paddle/phi/ops/compat/broadcast_tensors_sig.cc diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index 27b1107675d4e..c3917fad555cb 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -31,64 +27,6 @@ class BroadcastTensorsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); - OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", - "broadcast_tensors"); - - int target_rank = 0; - const auto& input_dims = ctx->GetInputsDim("X"); - - // 1. Find Output rank = max(Inputs rank) - for (const auto& input_ddim : input_dims) { - target_rank = std::max(target_rank, input_ddim.size()); - } - - PADDLE_ENFORCE_GT( - target_rank, 0, - platform::errors::InvalidArgument( - "BroadcastTensorsOp requires at least one input tensor" - "to have rank greater than zero")); - - std::vector target_dims(target_rank, 0); - // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) - for (int index = 0; index < target_rank; index++) { - // Loop axes in reverse order, - // For each axis, take the maximum as target size - // Fill size = 1 if shape vector exhausts - int target_dim_size = 1; - for (const auto& input_ddim : input_dims) { - // Reversed order - int axis = static_cast(input_ddim.size()) - index - 1; - int dim_size = 1; - if (axis >= 0) { - dim_size = input_ddim[axis]; - } - - if (target_dim_size != 1 && dim_size != 1 && - target_dim_size != dim_size) { - PADDLE_THROW(platform::errors::InvalidArgument( - "BroadcastTensorsOp inputs does not satisfy bcast semantics," - "Please check axis = %d in reverse order", - index)); - } - - // We performed bcast semantics check at python level - // So input tensors should all have legal shape - target_dim_size = std::max(target_dim_size, dim_size); - } - target_dims[target_rank - index - 1] = target_dim_size; - } - - // 3. 
Set Output Dim - std::vector output_ddims; - for (size_t i = 0; i < input_dims.size(); i++) { - output_ddims.emplace_back(phi::make_ddim(target_dims)); - } - ctx->SetOutputsDim("Out", output_ddims); - ctx->ShareAllLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -229,34 +167,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, + BroadcastTensorsInferShapeFunctor, + PT_INFER_META(phi::BroadcastTensorsInferMeta)); + REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, ops::BroadcastTensorsGradOpMaker, ops::BroadcastTensorsGradOpMaker, - ops::BroadcastTensorsOpVarTypeInference); + ops::BroadcastTensorsOpVarTypeInference, + BroadcastTensorsInferShapeFunctor); REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, ops::BroadcastTensorsGradOpVarTypeInference, ops::BroadcastTensorsGradNoNeedBufVarsInferer); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CPU_KERNEL( - broadcast_tensors_grad, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel, - ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu deleted file mode 100644 index 5882258317d7d..0000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.cu +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/broadcast_tensors_op.h" - -#include -#include -#include -#include -#include - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; - -template -class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const DDim& input_dims = input_tensor->dims(); - const DDim& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // Collect reduce_dims - // Example: - // dX = [1,1,1,1] - // dOut = [1,1,1,4] - // - // reduce_dims = [3] // reduce along the broadcasted axis - std::vector reduce_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // Turns out to be a No-Op, simply copy tensors - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - // reduce_sum implementation on CUDA - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), *input_tensor, output_tensor, - kps::IdentityFunctor(), reduce_dims_vec, stream); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - broadcast_tensors, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel, - ops::BroadcastTensorsOpKernel); - -REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel, - ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h deleted file mode 100644 index 682f2e2476922..0000000000000 --- a/paddle/fluid/operators/broadcast_tensors_op.h +++ /dev/null @@ -1,282 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define SWITCH_OUT_RANK_CASE(n) \ - case n: { \ - ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ - break; \ - } - -namespace paddle { -namespace operators { - -using framework::Tensor; -using framework::DDim; -using framework::EigenTensor; - -template -class BroadcastTensorsOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const auto& in_tensors = context.MultiInput("X"); - auto out_tensors = context.MultiOutput("Out"); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // Eigen has no support for dynamic ranked tensor - // Thus we perform static expansion for each possible ranks - for (size_t i = 0; i < num_ins; i++) { - int out_rank = out_tensors[i]->dims().size(); - switch (out_rank) { - SWITCH_OUT_RANK_CASE(1) - SWITCH_OUT_RANK_CASE(2) - SWITCH_OUT_RANK_CASE(3) - SWITCH_OUT_RANK_CASE(4) - SWITCH_OUT_RANK_CASE(5) - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Target tensor rank out of range" - "Maximum supported rank for broadcast is: 5")); - } - } - } - } - - template - void ApplyBroadcast(const framework::ExecutionContext& context, - const Tensor* input_tensor, Tensor* output_tensor) const { - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // 1. Collect bcast_dims, each element of which indicates how many - // times we need to replicate along the corresponding dimension - // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for - // both input and output tensors, so we need to initialize input X with - // expanded dims: "new_input_dims_vec" - Eigen::DSizes bcast_dims; - std::vector new_input_dims_vec(out_rank); - for (int j = 0; j < out_rank; j++) { - int out_axis = out_rank - j - 1; - int in_axis = in_rank - j - 1; - - bcast_dims[out_axis] = output_dims[out_axis]; - new_input_dims_vec[out_axis] = 1; - if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { - bcast_dims[out_axis] = 1; - new_input_dims_vec[out_axis] = input_dims[in_axis]; - } - } - auto new_input_dims = phi::make_ddim(new_input_dims_vec); - - // Initialize input X with new_input_dims_vec, so it's rank-aligned with the - // output - auto x = EigenTensor::From(*input_tensor, new_input_dims); - - output_tensor->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*output_tensor, output_dims); - - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, OutRank>::Eval(place, y, x, - bcast_dims); - } -}; - -#define SWITCH_RESHAPE_DIMS(n) \ - case n: { \ - Eigen::DSizes reshape_dims; \ - for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ - reshape_dims[i] = reshape_dims_vec[i]; \ - } \ - dX.device(place) = \ - dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ - break; \ - } - -#define UPPER_SWITCH_REDUCE_DIMS(m) \ - case m: { \ - Eigen::DSizes reduce_dims; \ - for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ - reduce_dims[i] = reduce_dims_vec[i]; \ - } \ - switch (reshape_size) { -#define LOWER_SWITCH_REDUCE_DIMS \ - default: { \ - PADDLE_THROW(platform::errors::InvalidArgument( \ - "Detected reshape size: %d out of range" \ - "Minimum value should be larger than reduce size %d" \ - "While maximum supported is: 5", \ - reshape_size, reduce_size)); \ - } \ - } \ - break; \ - } - -/* ----- GradOpKernel ----- */ -template -class BroadcastTensorsGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Find reduce dimensions - const auto& in_tensors = - context.MultiInput(framework::GradVarName("Out")); - auto out_tensors = context.MultiOutput(framework::GradVarName("X")); - - size_t num_ins = in_tensors.size(); - - PADDLE_ENFORCE_GT( - num_ins, 1, - platform::errors::InvalidArgument( - "Expected at least 2 input tensors, but only received d%.", - in_tensors.size())); - - PADDLE_ENFORCE_EQ( - num_ins, out_tensors.size(), - platform::errors::InvalidArgument( - "BroadcastTensorsOp expects equal number of inputs and outputs," - "but received: %d inputs v.s %d outputs", - num_ins, out_tensors.size())); - - // For each In-Out tensor pair, - // Prepare and apply broadcast dims array - for (size_t i = 0; i < num_ins; i++) { - const auto* input_tensor = in_tensors[i]; - auto* output_tensor = out_tensors[i]; - - const auto& input_dims = input_tensor->dims(); - const auto& output_dims = output_tensor->dims(); - - int in_rank = input_dims.size(); - int out_rank = output_dims.size(); - - // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes - // Here we perform the following Eigen operations: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - // Note the last "reshape(dX_shape)" will be performed implicitly, - // and we only need to collect reduce_dims and reshape_dims - std::vector reduce_dims_vec; - std::vector reshape_dims_vec; - for (int j = 0; j < in_rank; j++) { - int out_axis = out_rank - j - 
1; - int in_axis = in_rank - j - 1; - - reshape_dims_vec.push_back(input_dims[j]); - if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { - reduce_dims_vec.push_back(in_axis); - } - } - - size_t reduce_size = reduce_dims_vec.size(); - size_t reshape_size = reshape_dims_vec.size(); - bool just_copy = (reduce_dims_vec.size() == 0); - output_tensor->mutable_data(context.GetPlace()); - if (just_copy) { - // If this turns out to be a No-Op, simply perform a tensor copy - framework::TensorCopy(*input_tensor, context.GetPlace(), - context.device_context(), output_tensor); - } else { - PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, - platform::errors::InvalidArgument( - "The number of dimensions of the input " - "'Out@GRAD' for Op(broadcast_tensors)" - " must be greater than or equal to 1, but " - "the value received is %d.", - reduce_dims_vec.size())); - PADDLE_ENFORCE_LE( - reduce_dims_vec.size(), 5, - platform::errors::InvalidArgument( - "The number of dimensions of the input 'Out@GRAD' " - "for Op(broadcast_tensors) must be less than or equal " - "to 5, but the value received is %d.", - reduce_dims_vec.size())); - - // Overall: - // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> - // reshape(dX_shape) -> dX - auto dX = framework::EigenVector::Flatten(*output_tensor); - auto dOut = framework::EigenVector::Flatten(*input_tensor); - auto& place = - *context.template device_context().eigen_device(); - - // Expand ReduceSize and ReshapeSize into static values - switch (reduce_size) { - UPPER_SWITCH_REDUCE_DIMS(1) - SWITCH_RESHAPE_DIMS(1) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(2) - SWITCH_RESHAPE_DIMS(2) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(3) - SWITCH_RESHAPE_DIMS(3) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(4) - SWITCH_RESHAPE_DIMS(4) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - UPPER_SWITCH_REDUCE_DIMS(5) - SWITCH_RESHAPE_DIMS(5) - LOWER_SWITCH_REDUCE_DIMS - - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Detected reduce size: %d out of range" - "While maximum supported is: 5", - reduce_size)); - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7634e5e01aca4..dc5478e8afb98 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -13,11 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/infermeta/multiary.h" - +#include #include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors) { + std::vector dims; + dims.reserve(tensors.size()); + for (const MetaTensor* tensor : tensors) { + dims.emplace_back(tensor->dims()); + } + return dims; +} + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -84,6 +94,60 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out) { + int target_rank = 0; + const auto& input_dims = GetMetaTensorsDim(x); + + // 1. 
Find Output rank = max(Inputs rank) + for (const auto& input_ddim : input_dims) { + target_rank = std::max(target_rank, input_ddim.size()); + } + + PADDLE_ENFORCE_GT(target_rank, + 0, + errors::InvalidArgument("BroadcastTensorsOp requires at " + "least one input tensor to have " + "rank greater than zero")); + + std::vector target_dims(target_rank, 0); + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + for (int index = 0; index < target_rank; index++) { + // Loop axes in reverse order, + // For each axis, take the maximum as target size + // Fill size = 1 if shape vector exhausts + int target_dim_size = 1; + for (const auto& input_ddim : input_dims) { + // Reversed order + int axis = static_cast(input_ddim.size()) - index - 1; + int dim_size = 1; + if (axis >= 0) { + dim_size = input_ddim[axis]; + } + + if (target_dim_size != 1 && dim_size != 1 && + target_dim_size != dim_size) { + PADDLE_THROW(errors::InvalidArgument( + "BroadcastTensorsOp inputs does not satisfy bcast semantics, " + "please check axis = %d in reverse order", + index)); + } + + // We performed bcast semantics check at python level + // So input tensors should all have legal shape + target_dim_size = std::max(target_dim_size, dim_size); + } + target_dims[target_rank - index - 1] = target_dim_size; + } + + // 3. Set Output Dim + for (size_t i = 0; i < out.size(); i++) { + out[i]->set_dims(phi::make_ddim(target_dims)); + out[i]->share_lod(*(x[i])); + out[i]->set_dtype(x[i]->dtype()); + } +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 2afb79daa355c..51738c5e08e98 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,8 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +std::vector GetMetaTensorsDim(const std::vector& tensors); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, @@ -25,6 +27,9 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void BroadcastTensorsInferMeta(const std::vector& x, + std::vector out); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/kernels/broadcast_tensors_grad_kernel.h b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h new file mode 100644 index 0000000000000..5ec2e35cc9b0c --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
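The broadcasting rule that BroadcastTensorsInferMeta implements above (output rank is the max input rank; each trailing axis takes the max size, and mixing sizes other than 1 is rejected) can be exercised with a small stand-alone sketch. The helper below is pure Python, purely illustrative, and not part of the patch; as the comment above notes, the same semantic check is already performed at the Python level before the op runs.

def broadcast_shape(shapes):
    rank = max(len(s) for s in shapes)
    out = [1] * rank
    for i in range(rank):                        # i counts axes from the right
        for s in shapes:
            d = s[len(s) - 1 - i] if i < len(s) else 1
            if d != 1 and out[rank - 1 - i] != 1 and d != out[rank - 1 - i]:
                raise ValueError("inputs do not satisfy bcast semantics at reverse axis %d" % i)
            out[rank - 1 - i] = max(out[rank - 1 - i], d)
    return out

print(broadcast_shape([[1, 4, 1], [5, 1, 3]]))   # [5, 4, 3]
print(broadcast_shape([[2, 3], [4, 1, 3]]))      # [4, 2, 3]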
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx); + +} // namespace phi diff --git a/paddle/phi/kernels/broadcast_tensors_kernel.h b/paddle/phi/kernels/broadcast_tensors_kernel.h new file mode 100644 index 0000000000000..fb2a6f1136c26 --- /dev/null +++ b/paddle/phi/kernels/broadcast_tensors_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out); + +} // namespace phi diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index 505d4d3744241..be13e2826ea81 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 44bfae9820aa8..3b3003392d37f 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -50,14 +50,10 @@ DenseTensor Conj(const Context& dev_ctx, const DenseTensor& x) { return x; } -template -void RealKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); - -template -void ImagKernel(const DeviceContext& dev_ctx, - const DenseTensor& x, - DenseTensor* out); +template +void RealKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +template +void ImagKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc new file mode 100644 index 0000000000000..7a97f8c218973 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_grad_kernel.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
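Before the CPU implementation that follows, the gradient rule is easy to state: BroadcastTensorsGrad reduces each incoming dOut with a sum over exactly those axes that were broadcast in the forward pass, i.e. axes that the corresponding dX lacks or holds with size 1. A small illustrative sketch (pure Python, names not from the patch) of how those reduce axes are chosen:

def grad_reduce_axes(dout_shape, dx_shape):
    axes = []
    for j in range(len(dout_shape)):             # walk axes from the right
        in_axis = len(dout_shape) - 1 - j
        out_axis = len(dx_shape) - 1 - j
        if out_axis < 0 or dx_shape[out_axis] != dout_shape[in_axis]:
            axes.append(in_axis)
    return axes

print(grad_reduce_axes([1, 1, 1, 4], [1, 1, 1, 1]))  # [3], the example in the kernel comments
print(grad_reduce_axes([2, 3, 4], [4]))              # [1, 0]

When the list comes back empty the kernels below simply copy dOut into dX; otherwise the CPU path reshapes and sums with Eigen, and the GPU path calls TensorReduceImpl with an identity functor.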
+ +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_RESHAPE_DIMS(n) \ + case n: { \ + Eigen::DSizes reshape_dims; \ + for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ + reshape_dims[i] = reshape_dims_vec[i]; \ + } \ + dX.device(place) = \ + dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ + break; \ + } + +#define UPPER_SWITCH_REDUCE_DIMS(m) \ + case m: { \ + Eigen::DSizes reduce_dims; \ + for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ + reduce_dims[i] = reduce_dims_vec[i]; \ + } \ + switch (reshape_size) { +#define LOWER_SWITCH_REDUCE_DIMS \ + default: { \ + PADDLE_THROW(errors::InvalidArgument( \ + "Detected reshape size: %d out of range" \ + "Minimum value should be larger than reduce size %d" \ + "While maximum supported is: 5", \ + reshape_size, \ + reduce_size)); \ + } \ + } \ + break; \ + } + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs, but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + PADDLE_ENFORCE_GE( + reduce_dims_vec.size(), + 1, + errors::InvalidArgument("The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but 
" + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), + 5, + errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = EigenVector::Flatten(*output_tensor); + auto dOut = EigenVector::Flatten(*input_tensor); + auto& place = *ctx.eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW( + errors::InvalidArgument("Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc new file mode 100644 index 0000000000000..4cb6db8769271 --- /dev/null +++ b/paddle/phi/kernels/cpu/broadcast_tensors_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + CPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu new file mode 100644 index 0000000000000..6fb24d72145c6 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -0,0 +1,111 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_grad_kernel.h" + +#include +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +void BroadcastTensorsGradKernel(const Context& ctx, + const std::vector& dout, + std::vector dx) { + // Find reduce dimensions + const auto& in_tensors = dout; + auto& out_tensors = dx; + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + auto* input_tensor = &in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const DDim& input_dims = input_tensor->dims(); + const DDim& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // Collect reduce_dims + // Example: + // dX = [1,1,1,1] + // dOut = [1,1,1,4] + // + // reduce_dims = [3] // reduce along the broadcasted axis + std::vector reduce_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + bool just_copy = (reduce_dims_vec.size() == 0); + ctx.template Alloc(output_tensor); + if (just_copy) { + // Turns out to be a No-Op, simply copy tensors + paddle::framework::TensorCopy( + *input_tensor, ctx.GetPlace(), ctx, output_tensor); + } else { + // reduce_sum implementation on CUDA + kernels::TensorReduceImpl>( + ctx, + *input_tensor, + output_tensor, + kps::IdentityFunctor(), + reduce_dims_vec, + ctx.stream()); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(broadcast_tensors_grad, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsGradKernel, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu new file mode 100644 index 0000000000000..aa45bd3c43891 --- /dev/null +++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" +#include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" + +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(broadcast_tensors, + GPU, + ALL_LAYOUT, + phi::BroadcastTensorsKernel, + bool, + int, + int64_t, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h new file mode 100644 index 0000000000000..eb01b83377cb6 --- /dev/null +++ b/paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/broadcast_tensors_kernel.h" + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(ctx, &in_tensors[i], out_tensors[i]); \ + break; \ + } + +namespace phi { + +template +void ApplyBroadcast(const Context& ctx, + const DenseTensor* input_tensor, + DenseTensor* output_tensor) { + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // 1. Collect bcast_dims, each element of which indicates how many + // times we need to replicate along the corresponding dimension + // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for + // both input and output tensors, so we need to initialize input X with + // expanded dims: "new_input_dims_vec" + Eigen::DSizes bcast_dims; + std::vector new_input_dims_vec(out_rank); + for (int j = 0; j < out_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + bcast_dims[out_axis] = output_dims[out_axis]; + new_input_dims_vec[out_axis] = 1; + if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { + bcast_dims[out_axis] = 1; + new_input_dims_vec[out_axis] = input_dims[in_axis]; + } + } + auto new_input_dims = phi::make_ddim(new_input_dims_vec); + + // Initialize input X with new_input_dims_vec, so it's rank-aligned with the + // output + auto x = EigenTensor::From(*input_tensor, new_input_dims); + + ctx.template Alloc(output_tensor); + auto y = EigenTensor::From(*output_tensor, output_dims); + + auto& place = *ctx.eigen_device(); + funcs::EigenBroadcast, T, OutRank>::Eval( + place, y, x, bcast_dims); +} + +template +void BroadcastTensorsKernel(const Context& ctx, + const std::vector& x, + std::vector out) { + const auto& in_tensors = x; + auto out_tensors = out; + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, + 1, + errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ(num_ins, + out_tensors.size(), + errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and " + "outputs,but received: %d inputs v.s %d outputs", + num_ins, + out_tensors.size())); + + // Eigen has no support for dynamic ranked tensor + // Thus we perform static expansion for each possible ranks + for (size_t i = 0; i < num_ins; i++) { + int out_rank = out_tensors[i]->dims().size(); + switch (out_rank) { + SWITCH_OUT_RANK_CASE(1) + SWITCH_OUT_RANK_CASE(2) + SWITCH_OUT_RANK_CASE(3) + SWITCH_OUT_RANK_CASE(4) + SWITCH_OUT_RANK_CASE(5) + default: { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Target tensor rank out of range" + "Maximum supported rank for broadcast is: 5")); + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/broadcast_tensors_sig.cc b/paddle/phi/ops/compat/broadcast_tensors_sig.cc new file mode 100644 index 0000000000000..2c979c4aedcc8 --- /dev/null +++ b/paddle/phi/ops/compat/broadcast_tensors_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
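ApplyBroadcast above aligns ranks before handing the work to Eigen: the input is viewed with a rank-aligned shape (size 1 wherever it will be replicated) and a per-axis replication factor is collected in bcast_dims. A pure-Python sketch of that bookkeeping, illustrative only and not part of the patch:

def align_for_eigen(in_shape, out_shape):
    out_rank = len(out_shape)
    bcast_dims = [0] * out_rank
    new_input_dims = [0] * out_rank
    for j in range(out_rank):                    # walk axes from the right
        out_axis = out_rank - 1 - j
        in_axis = len(in_shape) - 1 - j
        bcast_dims[out_axis] = out_shape[out_axis]
        new_input_dims[out_axis] = 1
        if in_axis >= 0 and in_shape[in_axis] == out_shape[out_axis]:
            bcast_dims[out_axis] = 1
            new_input_dims[out_axis] = in_shape[in_axis]
    return new_input_dims, bcast_dims

print(align_for_eigen([4, 1], [5, 4, 3]))   # ([1, 4, 1], [5, 1, 3])

Eigen then broadcasts the reshaped input by bcast_dims, which is why the kernel only needs the static ranks 1 through 5 handled by the switch above.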
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BroadcastTensorsGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "broadcast_tensors_grad", {GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(broadcast_tensors_grad, + phi::BroadcastTensorsGradOpArgumentMapping); From 7a857924570084851be8b6094f181f217d58fb7c Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Wed, 2 Mar 2022 17:18:53 +0800 Subject: [PATCH 19/41] Move transpose to pten (#39327) * immigrate_transpose_to_pten cpu kernel only; test=develop * fix bug; test=develop * add transpose cuda api * bug fix; * fix bugs * fix bugs; test=develop * bug fix; * move transepose to pten; test=develop * fix bug; test=develop * fix bugs; test=develop * add transpose grad fp16 support; test=develop * fix bug; test=develop * fix npu bug; test=develop * fix nemul = 0 bug; test=develop * add fp16 support; test=develop * fix data type register bug; test=develop * fix transpose bug; test=develop * update transpose * fix transpose bug; test=develop * remove useless code; test=develop * remove useless code; test=develop * fix transpose alias bug; test=develop * polish code; test=develop * resolve confict; test=develop * resolve confilct; test=develop * recover prepared operator; test=develop * fix bug; test=develop * polish code; test=develop * fix bug; test=develop * fix bug; test=develop --- .../operators/mkldnn/test_mkldnn_op_nhwc.cc | 2 +- paddle/fluid/operators/transpose_op.cc | 60 ++------ paddle/fluid/operators/transpose_op.cu | 139 ------------------ paddle/fluid/operators/transpose_op.cu.h | 42 +++--- paddle/fluid/operators/transpose_op.h | 58 -------- .../fluid/operators/transpose_op_npu_test.cc | 2 +- .../phi/kernels/cpu/transpose_grad_kernel.cc | 32 ++++ paddle/phi/kernels/cpu/transpose_kernel.cc | 80 ++++++++++ paddle/phi/kernels/funcs/math_function.cu | 51 +++++++ .../phi/kernels/gpu/transpose_grad_kernel.cu | 34 +++++ paddle/phi/kernels/gpu/transpose_kernel.cu | 57 +++++++ .../kernels/impl/transpose_grad_kernel_impl.h | 38 +++++ paddle/phi/kernels/transpose_grad_kernel.h | 28 ++++ paddle/phi/kernels/transpose_kernel.h | 28 ++++ paddle/phi/ops/compat/transpose_sig.cc | 38 +++++ .../unittests/parallel_executor_test_base.py | 2 +- ..._imperative_lod_tensor_to_selected_rows.py | 1 + .../test_parallel_executor_transformer.py | 1 + ...test_partial_eager_deletion_transformer.py | 2 + .../tests/unittests/test_transpose_op.py | 1 + 20 files changed, 426 insertions(+), 270 deletions(-) delete mode 100644 paddle/fluid/operators/transpose_op.cu create mode 100644 paddle/phi/kernels/cpu/transpose_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/transpose_kernel.cc create mode 100644 paddle/phi/kernels/gpu/transpose_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/transpose_kernel.cu create mode 100644 paddle/phi/kernels/impl/transpose_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/transpose_grad_kernel.h create mode 100644 paddle/phi/kernels/transpose_kernel.h create mode 100644 paddle/phi/ops/compat/transpose_sig.cc diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 52e2caaeb6ee1..3791fed23a84f 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -29,7 +29,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); 
USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(transpose); +USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); namespace paddle { diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 768ab21936f1e..1a297e7238ccd 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -339,6 +339,14 @@ class Transpose2OpGrad : public framework::OperatorWithKernel { } }; +class TransposeGradInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + ctx->SyncTypeAndDataType(framework::GradVarName("Out"), + framework::GradVarName("X")); + } +}; + } // namespace operators } // namespace paddle @@ -347,59 +355,13 @@ REGISTER_OPERATOR( transpose, ops::TransposeOp, ops::TransposeOpMaker, paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); - -REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); +REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad, + ops::TransposeGradInferVarType); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad, + ops::TransposeGradInferVarType, ops::Transpose2DoubleGradMaker, ops::Transpose2DoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - transpose2, ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel, - ops::TransposeKernel>, - ops::TransposeKernel>, - ops::TransposeKernel); -REGISTER_OP_CPU_KERNEL( - transpose2_grad, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel, - ops::TransposeGradKernel>, - ops::TransposeGradKernel>, - ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu deleted file mode 100644 index 02e224549a5ab..0000000000000 --- a/paddle/fluid/operators/transpose_op.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/transpose_op.cu.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -template -class TransposeGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *x_tensor, axis, out_tensor); - } -}; -template -class TransposeGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - if (!x_grad) { - return; - } - - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - const auto& dev_ctx = context.template device_context(); - TransposeGPUKernelDriver(dev_ctx, ndims, *out_grad_tensor, reversed_axis, - x_grad_tensor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - transpose2, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel, - ops::TransposeGPUKernel>, - ops::TransposeGPUKernel>); -REGISTER_OP_CUDA_KERNEL( - transpose2_grad, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel, - ops::TransposeGradGPUKernel>, - ops::TransposeGradGPUKernel>); diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index b542fa37f88fd..a31ac28c9910c 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,8 +16,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" namespace paddle { namespace operators { @@ -258,10 +259,10 @@ struct SystemElemType<16> { }; template -void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, - int tile_size_i, int tile_size_j, - int total_tiles_count, const T* input, - const Dim3& input_dims, T* output) { +void LaunchNarrowDims2TransposeKernel(const phi::GPUContext& d, int tile_size_i, + int tile_size_j, int total_tiles_count, + const T* input, const Dim3& input_dims, + T* output) { constexpr int NumThreads = tile_long; if (tile_size_i <= tile_long && tile_size_j <= tile_short) { TilingSwapDim1And2< @@ -278,7 +279,7 @@ void LaunchNarrowDims2TransposeKernel(const platform::CUDADeviceContext& d, template struct NarrowDims2TransposeDispatch { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -319,7 +320,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if< CheckNonLongTileSize(tile_long, tile_short, sizeof(T)), void>::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -351,7 +352,7 @@ struct NarrowDims2TransposeDispatch< T, tile_long, tile_short, typename std::enable_if::type> { - static void DoTranspose(const platform::CUDADeviceContext& d, int tile_size_i, + static void DoTranspose(const phi::GPUContext& d, int tile_size_i, int tile_size_j, int total_tiles_count, const T* input, const Dim3& input_dims, T* output) { PADDLE_ENFORCE_EQ( @@ -368,7 +369,7 @@ struct NarrowDims2TransposeDispatch< }; template -void SwapDim1And2InNarrow(const platform::CUDADeviceContext& d, const T* input, +void SwapDim1And2InNarrow(const phi::GPUContext& d, const T* input, const Dim3& input_dims, T* output, const int kMinTileSize) { // First get available tile sizes for the data type requested as backups @@ -473,9 +474,8 @@ __global__ void TransposeSimpleKernel(int nthreads, const T* __restrict__ input, // Here suppose convert all tensor to dim3, so just change dim1 and 2. 
template -void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, - const T* input, const Dim3& input_dims, - T* output) { +void SendSwapDim1And2InTranspose(const phi::GPUContext& d, const T* input, + const Dim3& input_dims, T* output) { // Suppose tile size > 16 static const int kMinTileSize = 16; static const int kMinNarrowTileSize = 96; @@ -512,7 +512,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, } else { // If input shape is small, such as 8X8, just do simple copy int total_elements = input_dims[0] * input_dims[1] * input_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_elements); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_elements); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( total_elements, input, input_dims, output); @@ -521,7 +521,7 @@ void SendSwapDim1And2InTranspose(const platform::CUDADeviceContext& d, template struct SwapDim1And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -533,7 +533,7 @@ struct SwapDim1And2InTranspose { template struct SwapDim0And2InTranspose { - typedef platform::CUDADeviceContext Device; + typedef phi::GPUContext Device; void operator()(const Device& d, const T* in, const std::vector& combined_dims, T* out) { Dim3 input_dims = {static_cast(combined_dims[0]), @@ -541,7 +541,7 @@ struct SwapDim0And2InTranspose { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; - auto config = GetGpuLaunchConfig1D(d, total_size); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(d, total_size); TransposeSimpleKernel<<< config.block_per_grid.x, config.thread_per_block.x, 0, d.stream()>>>( @@ -607,7 +607,7 @@ inline void CombineTransposeDim3(const framework::DDim& shape, template struct TransposeSimple { - static bool run(const platform::CUDADeviceContext& ctx, const Tensor& in, + static bool run(const phi::GPUContext& ctx, const Tensor& in, const std::vector perm, Tensor* out) { // First reduce the dimensions of the input tensor if possible. 
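// (Worked example with a made-up shape:) source dimensions that stay
// adjacent and in order under `perm` can be merged first, e.g. shape
// [4, 5, 6, 7] with perm = {0, 1, 3, 2} reduces to shape [20, 6, 7] with
// perm = {0, 2, 1}, which the tiled dim1/dim2 swap above then handles; if
// the reduced permutation is not one of the supported patterns, run()
// reports failure and the driver below falls back to the generic
// TransCompute path.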
std::vector new_perm; @@ -654,12 +654,12 @@ struct TransposeSimple { }; template -void TransposeGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const int ndims, const Tensor& in, - const std::vector perm, Tensor* out) { +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, + const Tensor& in, + const std::vector& perm, Tensor* out) { auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + TransCompute(ndims, dev_ctx, in, out, perm); } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index ec05a534c0ef5..a9e4876cc82a4 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -59,63 +59,5 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } -template -class TransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.InputVar("X"); - auto* out = context.OutputVar("Out"); - - const framework::Tensor* x_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*x); - framework::Tensor* out_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(out); - - out_tensor->mutable_data(context.GetPlace()); - if (out_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *x_tensor, out_tensor, axis); - } -}; - -template -class TransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out_grad = context.InputVar(framework::GradVarName("Out")); - auto* x_grad = context.OutputVar(framework::GradVarName("X")); - - if (!x_grad) { - return; - } - const framework::Tensor* out_grad_tensor = - GetLoDTensorOrSelectedRowsValueFromVar(*out_grad); - framework::Tensor* x_grad_tensor = - GetMutableLoDTensorOrSelectedRowsValueFromVar(x_grad); - - x_grad_tensor->mutable_data(context.GetPlace()); - if (x_grad_tensor->numel() == 0) { - return; - } - - std::vector axis = context.Attr>("axis"); - std::vector reversed_axis(axis); - - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } - - int ndims = axis.size(); - auto& dev_ctx = context.template device_context(); - TransCompute(ndims, dev_ctx, *out_grad_tensor, - x_grad_tensor, reversed_axis); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index cce3f188c8b74..5617d728a51dc 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -31,7 +31,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(transpose2); +USE_OP_ITSELF(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); template diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc new file mode 100644 index 0000000000000..9dbcf575f33c1 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(transpose_grad, + CPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc new file mode 100644 index 0000000000000..a80196e7f80e1 --- /dev/null +++ b/paddle/phi/kernels/cpu/transpose_kernel.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_kernel.h" +#include +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { + +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + int rank = axis.size(); + switch (rank) { + case 1: + funcs::Transpose trans1; + trans1(ctx, x, out, axis); + break; + case 2: + funcs::Transpose trans2; + trans2(ctx, x, out, axis); + break; + case 3: + funcs::Transpose trans3; + trans3(ctx, x, out, axis); + break; + case 4: + funcs::Transpose trans4; + trans4(ctx, x, out, axis); + break; + case 5: + funcs::Transpose trans5; + trans5(ctx, x, out, axis); + break; + case 6: + funcs::Transpose trans6; + trans6(ctx, x, out, axis); + break; + default: + // for rank >= 7 situation + funcs::TransposeNormal trans_normal; + trans_normal(ctx, x, out, axis); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + CPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index ae368a005f057..df2af82d551ee 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -187,6 +187,57 @@ void TransposeNormal::operator()( in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, rank); } +template 
+struct TransposeNormal { + void operator()(const phi::GPUContext& context, + const DenseTensor& in, + DenseTensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = stride(in.dims()); + auto out_stride = stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const phi::GPUPlace& cuda_place = context.GetPlace(); + phi::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, + out_ptr, + elements, + in_stride_ptr, + out_stride_ptr, + axis_ptr, + rank); + } +}; + // define transpose normal #define DEFINE_GPU_TRANS_NORMAL(TYPE) \ template struct TransposeNormal; \ diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu new file mode 100644 index 0000000000000..0687dc0c200a8 --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(transpose_grad, + GPU, + ALL_LAYOUT, + phi::TransposeGradKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/transpose_kernel.cu b/paddle/phi/kernels/gpu/transpose_kernel.cu new file mode 100644 index 0000000000000..9ea2af292ccf1 --- /dev/null +++ b/paddle/phi/kernels/gpu/transpose_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/api/ext/dispatch.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +#include "paddle/fluid/framework/gpu_utils.h" +#include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h" + +namespace phi { +template +void TransposeKernel(const Context& ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + int rank = axis.size(); + ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + paddle::operators::TransposeGPUKernelDriver(ctx, rank, x, axis, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(transpose, + GPU, + ALL_LAYOUT, + phi::TransposeKernel, + bool, + float, + double, + int32_t, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h new file mode 100644 index 0000000000000..6bb555fe28f11 --- /dev/null +++ b/paddle/phi/kernels/impl/transpose_grad_kernel_impl.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad) { + std::vector reversed_axis(axis); + + dev_ctx.template Alloc(x_grad); + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + TransposeKernel(dev_ctx, out_grad, reversed_axis, x_grad); +} + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_grad_kernel.h b/paddle/phi/kernels/transpose_grad_kernel.h new file mode 100644 index 0000000000000..33d4ca7e3c6c2 --- /dev/null +++ b/paddle/phi/kernels/transpose_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h new file mode 100644 index 0000000000000..303b4a9a8f05d --- /dev/null +++ b/paddle/phi/kernels/transpose_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void TransposeKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/transpose_sig.cc b/paddle/phi/ops/compat/transpose_sig.cc new file mode 100644 index 0000000000000..90961760cfc66 --- /dev/null +++ b/paddle/phi/ops/compat/transpose_sig.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
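// For reference while reading the new grad kernel in this patch:
// TransposeGradKernel builds the inverse permutation via
// reversed_axis[axis[i]] = i and then reuses the forward TransposeKernel on
// out_grad. A standalone check of that inverse (plain C++, axis values made
// up for this note):

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> axis = {2, 0, 1};
  std::vector<int> reversed(axis.size());
  for (int i = 0; i < static_cast<int>(axis.size()); ++i) {
    reversed[axis[i]] = i;
  }

  // Composing a permutation with its inverse restores the original order,
  // which is why transposing out_grad by `reversed` undoes the forward op.
  std::vector<int> dims = {0, 1, 2}, once(3), round_trip(3);
  for (int i = 0; i < 3; ++i) once[i] = dims[axis[i]];
  for (int i = 0; i < 3; ++i) round_trip[i] = once[reversed[i]];

  std::printf("reversed = {%d, %d, %d}, round trip = {%d, %d, %d}\n",
              reversed[0], reversed[1], reversed[2],
              round_trip[0], round_trip[1], round_trip[2]);  // {1,2,0}, {0,1,2}
  return 0;
}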
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TransposeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("transpose", {"X"}, {"axis"}, {"Out"}); +} + +KernelSignature TransposeGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "transpose_grad", {GradVarName("Out")}, {"axis"}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(transpose2, transpose); +PD_REGISTER_BASE_KERNEL_NAME(transpose2_grad, transpose_grad); + +PD_REGISTER_ARG_MAPPING_FN(transpose2, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose2_grad, + phi::TransposeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose, phi::TransposeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(transpose_grad, phi::TransposeGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 2a8f72c217055..2633a5992563f 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,7 +43,7 @@ def check_network_convergence(cls, get_data_from_feeder=None, use_parallel_executor=True, use_reduce=False, - use_ir_memory_optimize=True, + use_ir_memory_optimize=False, enable_inplace=True, fuse_elewise_add_act_ops=False, fuse_all_optimizer_ops=False, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index d54194164a559..110bb961bbe12 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -207,4 +207,5 @@ def simple_net_float32(self, is_sparse, dtype): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 1cb39eb131b82..b87e8d4e3c21a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -206,4 +206,5 @@ def test_main(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py index 1661f753a8464..15d9e0e2daa5e 100644 --- a/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py @@ -14,10 +14,12 @@ import unittest import paddle.fluid as fluid +import paddle fluid.core._set_eager_deletion_mode(0.0, 0.55, True) from test_parallel_executor_transformer import TestTransformer if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 13b880b28bf85..1e6b4354dd9c8 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -463,4 +463,5 @@ def test_error(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() From 
66196573ffe73bd3e02a4f713e2b2578bbf601aa Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 2 Mar 2022 17:50:32 +0800 Subject: [PATCH 20/41] [XPU] Fix Phi Kernel cache problem in operator.cc (#40044) * [XPU] Fix Phi Kernel cache problem in operator.cc * fix typo --- paddle/fluid/framework/operator.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b91ee3c2d633d..ffdc3e6d3c2bc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1210,6 +1210,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } + } else { + pt_kernel_name = pt_kernel_signature_->name; + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); } #ifdef PADDLE_WITH_XPU bool is_xpu_unsupport = From 5898e9abecc05bc039e29838ec4b8fb49ae2d3f0 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 2 Mar 2022 18:25:54 +0800 Subject: [PATCH 21/41] [Phi]Move elementwise function to funcs directory (#39986) * move elementwise function to funcs directory * fix compile bugs * modify according to comment --- .../elementwise/elementwise_add_op.kps | 2 +- .../elementwise/elementwise_op_broadcast.cu.h | 3 - .../elementwise/elementwise_op_function.h | 29 +- .../elementwise/elementwise_op_impl.cu.h | 2 +- paddle/fluid/operators/viterbi_decode_op.h | 12 +- paddle/phi/kernels/cpu/elementwise.h | 619 +---------------- paddle/phi/kernels/cpu/elementwise_grad.h | 146 ++++ .../kernels/cpu/elementwise_grad_kernel.cc | 27 +- paddle/phi/kernels/cpu/logical_kernel.cc | 20 +- paddle/phi/kernels/cpu/math_kernel.cc | 9 +- paddle/phi/kernels/funcs/broadcast_function.h | 18 +- paddle/phi/kernels/funcs/elementwise_base.h | 285 ++++---- .../elementwise_grad_base.h} | 655 +++++++++++------- paddle/phi/kernels/funcs/elementwise_utils.h | 121 ++++ paddle/phi/kernels/gpu/elementwise_grad.h | 246 +++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 27 +- paddle/phi/kernels/gpu/logical_kernel.cu | 3 +- paddle/phi/kernels/gpu/math_kernel.cu | 2 +- .../impl/elementwise_grad_kernel_impl.h | 33 +- 19 files changed, 1149 insertions(+), 1110 deletions(-) create mode 100644 paddle/phi/kernels/cpu/elementwise_grad.h rename paddle/phi/kernels/{gpu/elementwise.h => funcs/elementwise_grad_base.h} (78%) create mode 100644 paddle/phi/kernels/funcs/elementwise_utils.h create mode 100644 paddle/phi/kernels/gpu/elementwise_grad.h diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index d6e0749318e90..3b7457d72e15d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -39,7 +39,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #else #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 418779c32e8bc..102127e6ffe4e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -16,9 +16,6 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -// only can include the headers in paddle/top/api dirs -#include "paddle/phi/kernels/gpu/elementwise.h" - namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index a1a7f83109866..61862aa9f8740 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -133,7 +134,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return phi::funcs::trim_trailing_singular_dims(dims); + return phi::funcs::TrimTrailingSingularDims(dims); } template ( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } else { - phi::ElemwiseGradComputeWithBroadcast( + phi::funcs::ElemwiseGradComputeWithBroadcast( dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } } @@ -173,19 +174,9 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, const framework::Tensor *y, int axis, Functor func, framework::Tensor *z) { z->mutable_data(ctx.GetPlace()); - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, - z); - -#endif - return; - } - const auto &dev_ctx = - ctx.template device_context(); - phi::ElementwiseCompute(dev_ctx, *x, *y, axis, func, z); + const auto &dev_ctx = ctx.template device_context(); + phi::funcs::ElementwiseCompute(dev_ctx, *x, *y, axis, + func, z); } // FusedElemwiseAndAct @@ -443,8 +434,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -991,8 +982,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? 
x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - phi::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + phi::funcs::GetMidDims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 7d7bb4f26fcf4..f49e2ab4e173e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ // only can include the headers in paddle/top/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index 8f01a0c36043b..bf12a03e7b4dc 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -151,12 +151,12 @@ struct GetInputIndex { const std::vector& output_strides, int output_idx, int* index_array, int* lhs_idx, int* rhs_idx) { int out_dims_size = output_strides.size(); - *lhs_idx = - phi::GetElementwiseIndex(lhs_dims.data(), out_dims_size, index_array); - *rhs_idx = - phi::GetElementwiseIndex(rhs_dims.data(), out_dims_size, index_array); - phi::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, - index_array); + *lhs_idx = phi::funcs::GetElementwiseIndex(lhs_dims.data(), out_dims_size, + index_array); + *rhs_idx = phi::funcs::GetElementwiseIndex(rhs_dims.data(), out_dims_size, + index_array); + phi::funcs::UpdateElementwiseIndexArray(output_dims.data(), out_dims_size, + index_array); } }; diff --git a/paddle/phi/kernels/cpu/elementwise.h b/paddle/phi/kernels/cpu/elementwise.h index 28bf5ab743f6d..0f67df661136d 100644 --- a/paddle/phi/kernels/cpu/elementwise.h +++ b/paddle/phi/kernels/cpu/elementwise.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -189,250 +189,6 @@ struct SameDimsMultiplyFunctor< } }; -inline void UpdateElementwiseIndexArray(const int* out_dims_array, - const int max_dim, - int* index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int* x_dims_array, - const int max_dim, - const int* index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonGradBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - DX_OP dx_op, - DY_OP dy_op) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - const Tout* out_data = out.data(); - const Tout* dout_data = dout.data(); - T* dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); - T* dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); - if (dx_data != nullptr) { - memset(dx_data, 0, dx->numel() * sizeof(T)); - } - if (dy_data != nullptr) { - memset(dy_data, 0, dy->numel() * sizeof(T)); - } - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (dx_data != nullptr) { - dx_data[x_index] += dx_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - if (dy_data != nullptr) { - dy_data[y_index] += dy_op(x_data[x_index], - y_data[y_index], - out_data[out_index], - dout_data[out_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonForwardBroadcastCPU(const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - int* x_dims_array, - int* y_dims_array, - int* out_dims_array, - int max_dim, - const CPUContext& ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T* x_data = x.data(); - const T* y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL( - x_data, phi::errors::InvalidArgument("The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL( - y_data, phi::errors::InvalidArgument("The input Y should not be empty.")); - OutType* out_data = ctx.Alloc(z); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = 
func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z, - const DDim& x_dims, - const DDim& y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - Functor func, - DenseTensor* z) { - dev_ctx.Alloc(z); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - funcs::TransformFunctor functor( - x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. 
- // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - template struct SameDimsElementwiseCompute { void operator()(const CPUContext& dev_ctx, @@ -443,377 +199,4 @@ struct SameDimsElementwiseCompute { } }; -// BACKWARD CODE - -template -static void ElemwiseGradBroadcast1CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int h, - int w, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int x_offset = i * w + j; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < h; ++i) { - for (int j = 0; j < w; ++j) { - int y_offset = i * w + j; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } -} - -template -static void ElemwiseGradBroadcast2CPU(const T* x, - const T* y, - const Tout* out, - const Tout* dout, - int pre, - int n, - int post, - bool is_xsize_larger, - DX_OP dx_op, - DY_OP dy_op, - T* dx, - T* dy) { - if (is_xsize_larger) { - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int x_offset = i * n * post + j * post + k; - if (dx != nullptr) { - dx[x_offset] = - dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - } - if (dy != nullptr) { - T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); - if (i == 0 && k == 0) { - dy[j] = tmp; - } else { - dy[j] += tmp; - } - } - } - } - } - } else { // x.dims < y.dims, broadcast for x. - for (int i = 0; i < pre; ++i) { - for (int j = 0; j < n; ++j) { - for (int k = 0; k < post; ++k) { - int y_offset = i * n * post + j * post + k; - if (dy != nullptr) { - dy[y_offset] = - dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - } - if (dx != nullptr) { - T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); - if (i == 0 && k == 0) { - dx[j] = tmp; - } else { - dx[j] += tmp; - } - } - } - } - } - } -} - -template -void CommonElementwiseBroadcastBackward(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - int max_dim = std::max(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - // for inplace strategy. memset will make dx and dout clear and get wrong - // result. 
- if (dx && dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); - } - - VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" - << phi::make_ddim(x_dims_array) - << " ydim:" << phi::make_ddim(y_dims_array); - - CommonGradBroadcastCPU(x, - y, - out, - dout, - dx, - dy, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - ctx, - dx_op, - dy_op); -} - -template -void ElemwiseGradComputeWithBroadcast(const CPUContext& ctx, - const DDim& x_dims, - const DDim& y_dims, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - bool is_xsize_larger = true; - - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - phi::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - phi::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common backward implementation. - if (is_run_common_broadcast) { - CommonElementwiseBroadcastBackward( - ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - return; - } - if (post == 1) { - ElemwiseGradBroadcast1CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } else { - ElemwiseGradBroadcast2CPU(x.data(), - y.data(), - out.data(), - dout.data(), - pre, - n, - post, - is_xsize_larger, - dx_op, - dy_op, - dx == nullptr ? nullptr : ctx.Alloc(dx), - dy == nullptr ? nullptr : ctx.Alloc(dy)); - } -} - -// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. -// explicit gradient can cut off X, Y, Out from gradient op -// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse -// elementwise code. 
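// Illustrative aside on the broadcast bookkeeping used by these grad
// helpers (shapes made up for this note): with x of shape [2, 3, 4], y of
// shape [3] and axis = 1, the shapes decompose into pre = 2, n = 3 and
// post = 4. For elementwise_add, dx is simply dout, while dy reduces dout
// over the pre and post axes, as in ElemwiseGradBroadcast2CPU above. A
// minimal standalone sketch of that reduction:

#include <cstdio>
#include <vector>

int main() {
  const int pre = 2, n = 3, post = 4;
  std::vector<float> dout(pre * n * post, 1.0f);  // stand-in upstream grad

  std::vector<float> dy(n, 0.0f);
  for (int i = 0; i < pre; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < post; ++k)
        dy[j] += dout[(i * n + j) * post + k];

  // Each dy[j] accumulates pre * post = 8 contributions.
  std::printf("dy = [%g, %g, %g]\n", dy[0], dy[1], dy[2]);
  return 0;
}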
-template -void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - int axis, - DenseTensor* dx, - DenseTensor* dy, - DX_OP dx_op, - DY_OP dy_op) { - const DDim& x_dim = x.dims(); - const DDim& y_dim = y.dims(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } else { - ElemwiseGradComputeWithBroadcast(dev_ctx, - x_dim, - y_dim, - dout, - dout, - out, - dout, - axis, - dx, - dy, - dx_op, - dy_op); - } -} - -/* -****************************** - Add Grad -****************************** -*/ -template -struct IdentityGrad { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - auto blas = phi::funcs::GetBlas(ctx); - if (dx) { - blas.VCOPY( - dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); - } - - if (dy) { - blas.VCOPY( - dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); - } -} - -template -typename std::enable_if::value>::type -elementwise_add_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, IdentityGrad>( - ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -struct SubGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } -}; - -template -struct SubGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } -}; - -template -void elementwise_sub_grad(const CPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out, - const DenseTensor& dout, - DenseTensor* dx, - DenseTensor* dy, - int axis = -1) { - ElemwiseExplicitGradCompute, SubGradDY>( - ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); -} - } // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad.h b/paddle/phi/kernels/cpu/elementwise_grad.h new file mode 100644 index 0000000000000..92587566eb875 --- /dev/null +++ b/paddle/phi/kernels/cpu/elementwise_grad.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" + +namespace phi { + +// NOTE(dzhwinter): Only used in elementwise_add, elementwise_sub. +// explicit gradient can cut off X, Y, Out from gradient op +// In elementwise_add, elementwise_sub, we use dout as fake X, Y, Out to reuse +// elementwise code. +template +void ElemwiseExplicitGradCompute(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim& x_dim = x.dims(); + const DDim& y_dim = y.dims(); + if (x.dims() == y.dims()) { + funcs::ElemwiseGradComputeNoBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } else { + funcs::ElemwiseGradComputeWithBroadcast(dev_ctx, + x_dim, + y_dim, + dout, + dout, + out, + dout, + axis, + dx, + dy, + dx_op, + dy_op); + } +} + +/* +****************************** + Add Grad +****************************** +*/ +template +struct IdentityGrad { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + auto blas = phi::funcs::GetBlas(ctx); + if (dx) { + blas.VCOPY( + dout.numel(), dout.data(), dx->mutable_data(ctx.GetPlace())); + } + + if (dy) { + blas.VCOPY( + dout.numel(), dout.data(), dy->mutable_data(ctx.GetPlace())); + } +} + +template +typename std::enable_if::value>::type +ElementwiseAddGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, IdentityGrad>( + ctx, x, y, out, dout, axis, dx, dy, IdentityGrad(), IdentityGrad()); +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +struct SubGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } +}; + +template +struct SubGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } +}; + +template +void ElementwiseSubGrad(const CPUContext& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + int axis = -1) { + ElemwiseExplicitGradCompute, SubGradDY>( + ctx, x, y, out, dout, axis, dx, dy, SubGradDX(), SubGradDY()); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index c878e8133ffc0..e48ee80595908 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -17,7 +17,8 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/cpu/elementwise_grad.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include 
"paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -33,7 +34,7 @@ void AddGradFunc(const CPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { ElemwiseExplicitGradCompute, IdentityGrad>( dev_ctx, @@ -68,15 +69,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -101,7 +94,7 @@ void SubtractGradKernel(const Context& dev_ctx, DenseTensor* dy) { // skip out auto* out = &dout; - elementwise_sub_grad(dev_ctx, x, y, *out, dout, dx, dy, axis); + ElementwiseSubGrad(dev_ctx, x, y, *out, dout, dx, dy, axis); } template @@ -112,15 +105,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc index 3d179e1e75f4f..a0747b128e538 100644 --- a/paddle/phi/kernels/cpu/logical_kernel.cc +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -16,7 +16,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/logical_functor.h" // See Note [ Why still include the fluid headers? 
] @@ -24,15 +24,15 @@ namespace phi { -#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ - template \ - void Logical##type##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - funcs::Logical##type##Functor binary_func; \ - ElementwiseCompute, T, bool>( \ - dev_ctx, x, y, -1, binary_func, out); \ +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + funcs::ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ } DEFINE_LOGICAL_BINARY_KERNEL(And) diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 5cfcfe62c7816..250f656926c05 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -20,6 +20,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/reduce_functor.h" @@ -45,10 +46,10 @@ namespace phi { auto x_dims = x.dims(); \ auto y_dims = y.dims(); \ if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::name##Functor(), out); \ } else { \ - ElementwiseCompute, T>( \ + funcs::ElementwiseCompute, T>( \ dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ } \ } \ @@ -93,10 +94,10 @@ void DivideRawKernel(const Context& dev_ctx, auto x_dims = x.dims(); auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::DivideFunctor(), out); } else { - ElementwiseCompute, T>( + funcs::ElementwiseCompute, T>( dev_ctx, x, y, axis, funcs::InverseDivideFunctor(), out); } } diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 84a36b849afa1..e9fd4cf47b834 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -25,6 +25,8 @@ namespace kps = phi::kps; namespace phi { namespace funcs { +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) + struct DimensionsTransform { using DimVector = std::vector; typedef void (*MergeFunctor)( @@ -183,8 +185,6 @@ struct DimensionsTransform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) - template __device__ __forceinline__ void LoadData( T *dst, @@ -578,6 +578,20 @@ void BroadcastKernel(const KPDevice &ctx, } } +template +void ElementwiseCompute(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + std::vector ins = {&x, &y}; + std::vector outs = {z}; + z->mutable_data(dev_ctx.GetPlace()); + BroadcastKernel( + dev_ctx, ins, &outs, axis, func); +} + #endif } // namespace funcs diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index d369781f845eb..235dbdd40f6b7 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -18,7 +18,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -44,28 +45,6 @@ using ConditionalT = namespace funcs { using DDim = phi::DDim; -template -struct ElemwiseGradNoBroadcast { - const T *x_; - const T *y_; - const Tout *out_; - const Tout *dout_; - - HOSTDEVICE void operator()(size_t i) { - if (dx_ != nullptr) { - dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); - } - if (dy_ != nullptr) { - dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); - } - } - - DX_OP dx_op_; - DY_OP dy_op_; - T *dx_; - T *dy_; -}; - template class RowwiseTransformIterator; @@ -293,73 +272,172 @@ class TransformFunctor { bool is_xsize_larger_; }; -inline DDim trim_trailing_singular_dims(const DDim &dims) { - // Remove trailing dimensions of size 1 for y - auto actual_dims_size = dims.size(); - for (; actual_dims_size != 0; --actual_dims_size) { - if (dims[actual_dims_size - 1] != 1) break; - } - if (actual_dims_size == dims.size()) return dims; - std::vector trim_dims; - trim_dims.resize(actual_dims_size); - for (int i = 0; i < actual_dims_size; ++i) { - trim_dims[i] = dims[i]; - } - if (trim_dims.size() == 0) { - return DDim(phi::make_dim()); +template +void CommonForwardBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL( + x_data, errors::InvalidArgument("The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL( + y_data, errors::InvalidArgument("The input Y should not be empty.")); + OutType *out_data = ctx.Alloc(z); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); } - DDim actual_dims = phi::make_ddim(trim_dims); - return actual_dims; } -/* - * Out = X ⊙ Y - * If Y's shape does not match X' shape, they will be reshaped. - * For example: - * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - * pre=2, n=3*4, post=5 - * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) - * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) - * pre=2*3, n=4*5, post=1 - * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) - * - * New parameter: *is_run_common_broadcast* is a flag to record whether to run - * common broadcast code. 
- */ -inline void get_mid_dims(const DDim &x_dims, - const DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post, - int *is_run_common_broadcast) { - *pre = 1; - *n = 1; - *post = 1; - *is_run_common_broadcast = 0; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - if (x_dims[i + axis] != y_dims[i]) { - PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, - true, - phi::errors::InvalidArgument( - "Broadcast dimension mismatch. Operands " - "could not be broadcast together with the shape of " - "X = [%s] and the shape of Y = [%s]. Received [%d] " - "in X is not equal to [%d] in Y.", - x_dims, - y_dims, - x_dims[i + axis], - y_dims[i])); - *is_run_common_broadcast = 1; - return; - } - (*n) *= y_dims[i]; - } - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; +template +void CommonElementwiseBroadcastForward(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + const DDim &x_dims, + const DDim &y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + phi::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + phi::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const CPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + int axis, + Functor func, + DenseTensor *z) { + dev_ctx.Alloc(z); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + TransformFunctor functor( + x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; } } @@ -395,41 +473,11 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); *ddx_safe = phi::Empty(dev_ctx, std::move(meta)); ddx_safe->mutable_data(dev_ctx.GetPlace()); - phi::funcs::SetConstant set_zero; + SetConstant set_zero; set_zero(dev_ctx, ddx_safe, static_cast(0)); } } -template -void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, - const DDim &x_dim, - const DDim &y_dim, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - int axis, - DenseTensor *dx, - DenseTensor *dy, - DX_OP dx_op, - DY_OP dy_op) { - size_t N = static_cast(phi::product(x_dim)); - phi::funcs::ForRange for_range(dev_ctx, N); - for_range(ElemwiseGradNoBroadcast{ - x.data(), - y.data(), - out.data(), - dout.data(), - dx_op, - dy_op, - dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), - dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); -} - inline void ElementwiseGradPreProcess(const DenseTensor &dout, DenseTensor *dx) { if (dx != nullptr) { @@ -806,6 +854,7 @@ void ElementwiseKernel(const KPDevice &ctx, } } } + #endif } // namespace funcs diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h similarity index 78% rename from paddle/phi/kernels/gpu/elementwise.h rename to paddle/phi/kernels/funcs/elementwise_grad_base.h index 12cafc7023bb5..dff0cfe5b8b90 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -14,16 +14,25 @@ limitations under the License. */ #pragma once -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/gpu/reduce.h" +#include "paddle/phi/kernels/funcs/elementwise_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +#endif #ifdef __HIPCC__ constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; #else constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; #endif + #define BLOCK_X 32 #define BLOCK_Y 32 @@ -36,21 +45,361 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; namespace phi { -// General binary elementwise comutaion with the support of broadcast. -template -void ElementwiseCompute(const GPUContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - std::vector ins = {&x, &y}; - std::vector outs = {z}; - z->mutable_data(dev_ctx.GetPlace()); - phi::funcs::BroadcastKernel( - dev_ctx, ins, &outs, axis, func); +namespace funcs { +using DDim = phi::DDim; + +template +void CommonGradBroadcastCPU(const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int *x_dims_array, + int *y_dims_array, + int *out_dims_array, + int max_dim, + const CPUContext &ctx, + DX_OP dx_op, + DY_OP dy_op) { + std::vector index_array(max_dim, 0); + const T *x_data = x.data(); + const T *y_data = y.data(); + const Tout *out_data = out.data(); + const Tout *dout_data = dout.data(); + T *dx_data = dx == nullptr ? nullptr : ctx.Alloc(dx); + T *dy_data = dy == nullptr ? nullptr : ctx.Alloc(dy); + if (dx_data != nullptr) { + memset(dx_data, 0, dx->numel() * sizeof(T)); + } + if (dy_data != nullptr) { + memset(dy_data, 0, dy->numel() * sizeof(T)); + } + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (dx_data != nullptr) { + dx_data[x_index] += dx_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + if (dy_data != nullptr) { + dy_data[y_index] += dy_op(x_data[x_index], + y_data[y_index], + out_data[out_index], + dout_data[out_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +static void ElemwiseGradBroadcast1CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int h, + int w, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int x_offset = i * w + j; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. 
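// [Editorial note, not part of the patch] In this branch x is the operand
// being broadcast, so every row i of dout contributes to the same dx[j].
// The gradient of a broadcast input is therefore a reduction over the
// broadcast extent: dx[j] is initialised at i == 0 and accumulated with
// "+=" for the remaining rows, mirroring how dy[j] is accumulated in the
// is_xsize_larger branch above.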
+ for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + int y_offset = i * w + j; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } +} + +template +static void ElemwiseGradBroadcast2CPU(const T *x, + const T *y, + const Tout *out, + const Tout *dout, + int pre, + int n, + int post, + bool is_xsize_larger, + DX_OP dx_op, + DY_OP dy_op, + T *dx, + T *dy) { + if (is_xsize_larger) { + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int x_offset = i * n * post + j * post + k; + if (dx != nullptr) { + dx[x_offset] = + dx_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + } + if (dy != nullptr) { + T tmp = dy_op(x[x_offset], y[j], out[x_offset], dout[x_offset]); + if (i == 0 && k == 0) { + dy[j] = tmp; + } else { + dy[j] += tmp; + } + } + } + } + } + } else { // x.dims < y.dims, broadcast for x. + for (int i = 0; i < pre; ++i) { + for (int j = 0; j < n; ++j) { + for (int k = 0; k < post; ++k) { + int y_offset = i * n * post + j * post + k; + if (dy != nullptr) { + dy[y_offset] = + dy_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + } + if (dx != nullptr) { + T tmp = dx_op(x[j], y[y_offset], out[y_offset], dout[y_offset]); + if (i == 0 && k == 0) { + dx[j] = tmp; + } else { + dx[j] += tmp; + } + } + } + } + } + } +} + +template +void CommonElementwiseBroadcastBackward(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + // for inplace strategy. memset will make dx and dout clear and get wrong + // result. + if (dx && dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x_dims, ctx.GetPlace()); + } + + VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" + << phi::make_ddim(x_dims_array) + << " ydim:" << phi::make_ddim(y_dims_array); + + CommonGradBroadcastCPU(x, + y, + out, + dout, + dx, + dy, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + ctx, + dx_op, + dy_op); +} + +template +void ElemwiseGradComputeWithBroadcast(const CPUContext &ctx, + const DDim &x_dims, + const DDim &y_dims, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + bool is_xsize_larger = true; + + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common backward implementation. + if (is_run_common_broadcast) { + CommonElementwiseBroadcastBackward( + ctx, x_dims, y_dims, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + return; + } + if (post == 1) { + ElemwiseGradBroadcast1CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); + } else { + ElemwiseGradBroadcast2CPU(x.data(), + y.data(), + out.data(), + dout.data(), + pre, + n, + post, + is_xsize_larger, + dx_op, + dy_op, + dx == nullptr ? nullptr : ctx.Alloc(dx), + dy == nullptr ? nullptr : ctx.Alloc(dy)); + } +} + +template +struct ElemwiseGradNoBroadcast { + const T *x_; + const T *y_; + const Tout *out_; + const Tout *dout_; + + HOSTDEVICE void operator()(size_t i) { + if (dx_ != nullptr) { + dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); + } + if (dy_ != nullptr) { + dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); + } + } + + DX_OP dx_op_; + DY_OP dy_op_; + T *dx_; + T *dy_; +}; + +template +void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, + const DDim &x_dim, + const DDim &y_dim, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + size_t N = static_cast(phi::product(x_dim)); + phi::funcs::ForRange for_range(dev_ctx, N); + for_range(ElemwiseGradNoBroadcast{ + x.data(), + y.data(), + out.data(), + dout.data(), + dx_op, + dy_op, + dx == nullptr ? nullptr : dev_ctx.template Alloc(dx), + dy == nullptr ? nullptr : dev_ctx.template Alloc(dy)}); } +#if defined(__NVCC__) || defined(__HIPCC__) // Suppose only has contiguous dims static inline bool CheckContiguousDims(const std::vector &broadcast_pos) { for (int i = 1; i < broadcast_pos.size(); ++i) { @@ -114,7 +463,6 @@ inline void ComputeBroadcastKernelSize(int *x_dims_array, } } -#ifndef __xpu__ template static __global__ void FastCommonGradBroadcastOneCUDAKernel(const T *x, const T *y, @@ -1282,13 +1630,13 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - funcs::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); // for inplace strategy. memset will make dx and dout clear and get wrong // result. 
if (dx && dx->IsSharedBufferWith(dout)) { @@ -1340,37 +1688,37 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, PADDLE_ENFORCE_GE( axis, 0, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be great than or equal to 0, but received axis is %d.", axis)); PADDLE_ENFORCE_LT(axis, max_dim, - phi::errors::InvalidArgument( + errors::InvalidArgument( "Axis should be less than %d, but received axis is %d.", max_dim, axis)); int pre, n, post, is_run_common_broadcast, axis_trim = 0; if (is_xsize_larger) { - auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + auto y_dims_trimed = TrimTrailingSingularDims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - funcs::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } else { - auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + auto x_dims_trimed = TrimTrailingSingularDims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - funcs::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); + GetMidDims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); } // special case for common backward implementation. if (is_run_common_broadcast) { @@ -1408,228 +1756,7 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, } } -/* -****************************** - Add Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseAddGradCUDAKernel( - const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { - int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - int stride = GRID_NUM_X * BLOCK_NUM_X; - int loop = size / vec_size; - int remainder = size % vec_size; - const float4 *dout_vec = reinterpret_cast(dout); - float4 *dx_vec = reinterpret_cast(dx); - float4 *dy_vec = reinterpret_cast(dy); - float4 tmp_loop; - - for (int i = tid; i < loop; i += stride) { - tmp_loop = dout_vec[i]; - dx_vec[i] = tmp_loop; - dy_vec[i] = tmp_loop; - } - - if (tid == loop && remainder != 0) { - T tmp_rem; - while (remainder) { - int idx = size - remainder; - remainder--; - tmp_rem = dout[idx]; - dx[idx] = tmp_rem; - dy[idx] = tmp_rem; - } - } -} - -template -void default_elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_add_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - auto *dout_data = dout.data(); - if (dx_data == dout_data && dy_data != dout_data) { - VLOG(4) << "Special case when dx_data is the same as dout_data, " - "only need copy dout to dy"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); - } else if (dx_data != dout_data && dy_data == dout_data) { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "only need copy dout to dx"; - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } else if (dx_data != dout_data && dy_data != dout_data) { - auto size = x.numel(); - int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / - PREDEFINED_BLOCK_SIZE, - 1); - SimpleElemwiseAddGradCUDAKernel< - T><<>>( - dout.data(), - size, - vec_size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); - } else { - VLOG(4) << "Special case when dy_data is the same as dout_data, " - "and dx_data is the same as dout_data, do not need " - "any operator"; - } -} - -/* -****************************** - Sub Grad -****************************** -*/ - -template -static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, - int64_t size, - T *dx, - T *dy) { - int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; - - while (col < size) { - if (dx != nullptr) { - dx[col] = dout[col]; - } - dy[col] = -dout[col]; - col += BLOCK_NUM_X * GRID_NUM_X; - } -} - -template -void default_elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy, - int axis = -1) { - auto *dout_data = dout.data(); - // dx - if (dx != nullptr) { - auto *dx_data = dx->mutable_data(ctx.GetPlace()); - if (dx->dims() == dout.dims()) { - if (dx_data != dout_data) { - phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dout, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(dout)) { - dx->clear(); - dx->mutable_data(x.dims(), ctx.GetPlace()); - } - std::vector reduce_dims = - funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); - } - } - // dy - if (dy != nullptr) { - auto *dy_data = dy->mutable_data(ctx.GetPlace()); - if (dy->dims() == dout.dims()) { - if (dy_data != dout_data) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = dy->numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); - } - } else { - std::vector reduce_dims = - funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - kernels::TensorReduceImpl>( - ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); - } - } -} - -template -void elementwise_sub_grad(const GPUContext &ctx, - const DenseTensor &x, - const DenseTensor &y, - const DenseTensor &out, - const DenseTensor &dout, - DenseTensor *dx, - DenseTensor *dy) { - dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); - auto size = x.numel(); - dim3 grid_size = - dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); - SimpleElemwiseSubGradCUDAKernel< - T><<>>( - dout.data(), - size, - dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); -} - #endif +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_utils.h b/paddle/phi/kernels/funcs/elementwise_utils.h new file mode 100644 index 0000000000000..3790044346dc4 --- /dev/null +++ b/paddle/phi/kernels/funcs/elementwise_utils.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { + +namespace funcs { + +using DDim = phi::DDim; + +/* + * Out = X ⊙ Y + * If Y's shape does not match X' shape, they will be reshaped. + * For example: + * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + * pre=2, n=3*4, post=5 + * x.shape(2, 12, 5) * y.shape(1, 12, 1).broadcast(2, 12, 5) + * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) + * pre=2*3, n=4*5, post=1 + * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * + * New parameter: *is_run_common_broadcast* is a flag to record whether to run + * common broadcast code. 
+ */ +inline void GetMidDims(const DDim &x_dims, + const DDim &y_dims, + const int axis, + int *pre, + int *n, + int *post, + int *is_run_common_broadcast) { + *pre = 1; + *n = 1; + *post = 1; + *is_run_common_broadcast = 0; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + if (x_dims[i + axis] != y_dims[i]) { + PADDLE_ENFORCE_EQ(y_dims[i] == 1 || x_dims[i + axis] == 1, + true, + phi::errors::InvalidArgument( + "Broadcast dimension mismatch. Operands " + "could not be broadcast together with the shape of " + "X = [%s] and the shape of Y = [%s]. Received [%d] " + "in X is not equal to [%d] in Y.", + x_dims, + y_dims, + x_dims[i + axis], + y_dims[i])); + *is_run_common_broadcast = 1; + return; + } + (*n) *= y_dims[i]; + } + for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } +} + +inline DDim TrimTrailingSingularDims(const DDim &dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + if (actual_dims_size == dims.size()) return dims; + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(phi::make_dim()); + } + DDim actual_dims = phi::make_ddim(trim_dims); + return actual_dims; +} + +inline int GetElementwiseIndex(const int *x_dims_array, + const int max_dim, + const int *index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +inline void UpdateElementwiseIndexArray(const int *out_dims_array, + const int max_dim, + int *index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h new file mode 100644 index 0000000000000..b17196b6b1156 --- /dev/null +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -0,0 +1,246 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
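[Editorial note, not part of the patch] The (pre, n, post) decomposition documented above is easiest to verify on the shapes from case 1 of the comment. The sketch below is illustrative only; it assumes the phi::funcs namespace this new header introduces and the phi::make_ddim helper already used elsewhere in this series.

#include "paddle/phi/kernels/funcs/elementwise_utils.h"

void GetMidDimsExample() {
  int pre = 0, n = 0, post = 0, is_run_common_broadcast = 0;
  // shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), axis = 1
  phi::funcs::GetMidDims(phi::make_ddim({2, 3, 4, 5}),
                         phi::make_ddim({3, 4}),
                         /*axis=*/1,
                         &pre, &n, &post, &is_run_common_broadcast);
  // pre == 2, n == 3 * 4 == 12, post == 5, is_run_common_broadcast == 0,
  // i.e. X is traversed as (2, 12, 5) and Y as (1, 12, 1), so the caller
  // can take the RunMidWise fast path instead of the generic broadcast one.
}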
*/ + +#pragma once + +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +/* +****************************** + Add Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) { + int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + int stride = GRID_NUM_X * BLOCK_NUM_X; + int loop = size / vec_size; + int remainder = size % vec_size; + const float4 *dout_vec = reinterpret_cast(dout); + float4 *dx_vec = reinterpret_cast(dx); + float4 *dy_vec = reinterpret_cast(dy); + float4 tmp_loop; + + for (int i = tid; i < loop; i += stride) { + tmp_loop = dout_vec[i]; + dx_vec[i] = tmp_loop; + dy_vec[i] = tmp_loop; + } + + if (tid == loop && remainder != 0) { + T tmp_rem; + while (remainder) { + int idx = size - remainder; + remainder--; + tmp_rem = dout[idx]; + dx[idx] = tmp_rem; + dy[idx] = tmp_rem; + } + } +} + +template +void DefaultElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + } + } +} + +template +void ElementwiseAddGrad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + ctx.template Alloc(dx); + ctx.template Alloc(dy); + auto *dx_data = dx->data(); + auto *dy_data = dy->data(); + auto *dout_data = dout.data(); + if (dx_data == dout_data && dy_data != dout_data) { + VLOG(4) << "Special case when dx_data is the same as dout_data, " + "only need copy dout to dy"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); + } else if (dx_data != dout_data && dy_data == dout_data) { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "only need copy dout to dx"; + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } else if (dx_data != dout_data && dy_data != dout_data) { + auto size = x.numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) / + PREDEFINED_BLOCK_SIZE, + 1); + SimpleElemwiseAddGradCUDAKernel< + T><<>>( + dout.data(), + size, + vec_size, + 
dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); + } else { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "and dx_data is the same as dout_data, do not need " + "any operator"; + } +} + +/* +****************************** + Sub Grad +****************************** +*/ + +template +static __global__ void SimpleElemwiseSubGradCUDAKernel(const T *dout, + int64_t size, + T *dx, + T *dy) { + int col = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X; + + while (col < size) { + if (dx != nullptr) { + dx[col] = dout[col]; + } + dy[col] = -dout[col]; + col += BLOCK_NUM_X * GRID_NUM_X; + } +} + +template +void default_elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + auto *dout_data = dout.data(); + // dx + if (dx != nullptr) { + auto *dx_data = dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout.dims()) { + if (dx_data != dout_data) { + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); + } + } else { + // For inplace strategy, dx will be stored in addr of dout, which makes + // the result of dy wrong. + if (dx->IsSharedBufferWith(dout)) { + dx->clear(); + dx->mutable_data(x.dims(), ctx.GetPlace()); + } + std::vector reduce_dims = + funcs::GetReduceDim(x.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + } + } + // dy + if (dy != nullptr) { + auto *dy_data = dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout.dims()) { + if (dy_data != dout_data) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = dy->numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), size, nullptr, dy->mutable_data(ctx.GetPlace())); + } + } else { + std::vector reduce_dims = + funcs::GetReduceDim(y.dims(), out.dims(), axis); + gpuStream_t stream = ctx.stream(); + kernels::TensorReduceImpl>( + ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); + } + } +} + +template +void elementwise_sub_grad(const GPUContext &ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy) { + dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); + auto size = x.numel(); + dim3 grid_size = + dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); + SimpleElemwiseSubGradCUDAKernel< + T><<>>( + dout.data(), + size, + dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); +} +} // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 3c4c01b1dc8ff..d00888aee6701 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -17,8 +17,9 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" namespace phi { @@ -33,9 +34,9 @@ void AddGradFunc(const GPUContext& dev_ctx, DenseTensor* dy, int axis = -1) { if (dx != 
nullptr && dy != nullptr && (dx->dims() == dy->dims())) { - elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy); + ElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy); } else { - default_elementwise_add_grad(dev_ctx, x, y, out, dout, dx, dy, axis); + DefaultElementwiseAddGrad(dev_ctx, x, y, out, dout, dx, dy, axis); } } @@ -58,15 +59,7 @@ void AddDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::AddDoubleGradImpl(dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>, - ElementwiseCompute, T>); + phi::AddDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } template @@ -106,15 +99,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, const DenseTensor& dout, int axis, DenseTensor* ddout) { - phi::SubtractDoubleGradImpl( - dev_ctx, - y, - ddx, - ddy, - dout, - axis, - ddout, - ElementwiseCompute, T>); + phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu index f32d4c77d4059..1c0bafc932ee8 100644 --- a/paddle/phi/kernels/gpu/logical_kernel.cu +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -16,9 +16,8 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/logical_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index fc73ccca6de18..af9d5574aa9fe 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/phi/kernels/math_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/gpu/elementwise.h" #include "paddle/phi/kernels/gpu/reduce.h" #ifdef __NVCC__ diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 460e74b58166a..ac7d6fd1a0e9c 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace phi { @@ -47,19 +47,14 @@ void AddGradImpl(const Context& dev_ctx, } } -template +template void AddDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func, - GradInverseFunc grad_inverse_func) { + DenseTensor* ddout) { // ddOut = ddx + ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -72,28 +67,28 @@ void AddDoubleGradImpl(const Context& dev_ctx, auto ddx_dims = ddx_safe.dims(); auto ddy_dims = ddy_safe.dims(); if (ddx_dims.size() >= ddy_dims.size()) { - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::AddFunctor(), ddout); } else { - grad_inverse_func(dev_ctx, - ddx_safe, - ddy_safe, - axis, - funcs::InverseAddFunctor(), - ddout); + funcs::ElementwiseCompute, T>( + dev_ctx, + ddx_safe, + ddy_safe, + axis, + funcs::InverseAddFunctor(), + ddout); } } } -template +template void SubtractDoubleGradImpl(const Context& dev_ctx, const DenseTensor& y, const paddle::optional& ddx, const paddle::optional& ddy, const DenseTensor& dout, int axis, - DenseTensor* ddout, - GradFunc grad_func) { + DenseTensor* ddout) { // DDOut = ddx - ddy if (ddout) { DenseTensor ddx_safe, ddy_safe; @@ -103,7 +98,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, dev_ctx, y, ddy.get_ptr(), &ddy_safe); ddout->mutable_data(dev_ctx.GetPlace()); - grad_func( + funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor(), ddout); } } From 2e6548a9cd2224e1a4b89c1351f1078273f98328 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:40:00 +0800 Subject: [PATCH 22/41] vec scale kernel (#40011) --- .../optimizers/distributed_fused_lamb_op.cu | 49 +++++++++++++++---- 1 file changed, 39 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index ca0828a6f6ab7..8bb4606ffff15 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -304,14 +304,30 @@ struct AndFunctor { HOSTDEVICE bool operator()(bool x, bool y) const { return x && y; } }; -template +template static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, const T2 *__restrict__ scale, T1 *__restrict__ y, int num) { static_assert(sizeof(T1) <= sizeof(T2), "sizeof(T1) must be not greater than sizeof(T2)."); T2 s = scale[0]; - CUDA_KERNEL_LOOP(i, num) { + + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector x_vec; + platform::AlignedVector y_vec; + + platform::Load(x + i, &x_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + y_vec[j] = static_cast(static_cast(x_vec[j]) * s); + } + platform::Store(y_vec, y + i); + } + + for (; i < num; ++i) { y[i] = static_cast(static_cast(x[i]) * s); } } @@ -396,7 +412,6 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( for (; i + VecSize <= num; i += stride) { platform::AlignedVector param_vec; platform::AlignedVector grad_vec; - platform::AlignedVector weight_decay_vec; 
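// [Editorial note, not part of the patch] The vectorised ScaleCUDAKernel above
// replaces the old CUDA_KERNEL_LOOP with a grid-stride loop over VecSize-wide
// chunks: each thread does one platform::Load into an AlignedVector, scales
// the VecSize elements, and writes them back with one platform::Store; the
// trailing scalar loop covers the last num % VecSize elements. For example,
// with num = 10, VecSize = 4 and a single block of two threads:
//   thread 0 starts at i = 0 and handles x[0..3] in one vector step,
//   thread 1 starts at i = 4 and handles x[4..7],
//   thread 0 then advances to i = 8, fails i + VecSize <= num, and finishes
//   x[8] and x[9] through the scalar tail.
// The launcher introduced below (LaunchScaleKernel) picks vec_size via
// GetChunkedVecSize and GetGpuLaunchConfig1D and dispatches through
// PD_VEC_LAUNCH_KERNEL, presumably so insufficiently aligned buffers fall
// back to the VecSize = 1 case.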
platform::AlignedVector mom1_vec; platform::AlignedVector mom2_vec; platform::AlignedVector trust_ratio_div_vec; @@ -760,6 +775,24 @@ static bool CreatePreMulScaleOpIfSupported(ncclDataType_t dtype, return false; } +template +static void LaunchScaleKernel(const platform::CUDADeviceContext &dev_ctx, + const T1 *x, const T2 *scale, T1 *y, int n, + gpuStream_t stream) { + int vec_size = std::min(GetChunkedVecSize(x, 0), GetChunkedVecSize(y, 0)); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, n, vec_size); + +#define PD_LAMB_VEC_SCALE_KERNEL_CASE \ + do { \ + ScaleCUDAKernel<<>>( \ + x, scale, y, n); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAMB_VEC_SCALE_KERNEL_CASE); +#undef PD_LAMB_VEC_SCALE_KERNEL_CASE +} + template static void NCCLReduceScatterWithScale( const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, @@ -775,10 +808,8 @@ static void NCCLReduceScatterWithScale( PADDLE_ENFORCE_EQ(nranks, 1, platform::errors::InvalidArgument( "nranks must be 1 when scale != nullptr.")); - auto numel = recvcount * nranks; - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, recvbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, recvbuff, recvcount * nranks, + stream); } return; } @@ -792,9 +823,7 @@ static void NCCLReduceScatterWithScale( if (scale && !should_destroy_op) { size_t numel = recvcount * nranks; T *new_sendbuff = buffer.Alloc(numel); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel); - ScaleCUDAKernel<<>>(sendbuff, scale, new_sendbuff, numel); + LaunchScaleKernel(dev_ctx, sendbuff, scale, new_sendbuff, numel, stream); sendbuff = new_sendbuff; } From 09258040e2584f4afd9114b994710232e6769970 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Wed, 2 Mar 2022 18:50:26 +0800 Subject: [PATCH 23/41] Move gather.h/gather.cu.h/scatter.h/scatter.cu.h to the phi library (#40043) * move gather.h gather.cu.h scatter.h scatter.cu.h to phi library * fix CI * fix rocm ci --- .../fluid/operators/detection/bbox_util.cu.h | 1 - .../detection/collect_fpn_proposals_op.cu | 10 +- .../detection/collect_fpn_proposals_op.h | 6 +- .../detection/distribute_fpn_proposals_op.cu | 5 +- .../detection/distribute_fpn_proposals_op.h | 15 +- .../detection/generate_mask_labels_op.cc | 1 - .../detection/generate_proposal_labels_op.cc | 16 +- .../detection/generate_proposals_op.cc | 18 +- .../detection/generate_proposals_op.cu | 9 +- .../detection/generate_proposals_v2_op.cc | 18 +- .../detection/generate_proposals_v2_op.cu | 9 +- paddle/fluid/operators/gather_nd_op.cu | 94 +++++----- paddle/fluid/operators/gather_nd_op.h | 66 ++++--- paddle/fluid/operators/gather_op.cu | 32 ++-- paddle/fluid/operators/gather_op.h | 68 +++---- paddle/fluid/operators/gather_test.cc | 4 +- paddle/fluid/operators/grid_sampler_op.h | 1 - .../fluid/operators/math/segment_pooling.cu | 6 +- paddle/fluid/operators/scatter_nd_add_op.cu | 41 ++-- paddle/fluid/operators/scatter_nd_add_op.h | 41 ++-- paddle/fluid/operators/scatter_op.cu | 50 +++-- paddle/fluid/operators/scatter_op.h | 63 +++---- paddle/fluid/operators/scatter_test.cc | 4 +- paddle/fluid/operators/segment_pool_op.cu | 1 - .../sequence_ops/sequence_scatter_op.cc | 2 - .../sequence_ops/sequence_scatter_op.h | 3 +- paddle/fluid/operators/viterbi_decode_op.cu | 38 ++-- paddle/fluid/operators/viterbi_decode_op.h | 128 +++++++------ .../kernels/funcs}/gather.cu.h | 176 +++++++++++------- .../operators => phi/kernels/funcs}/gather.h | 114 +++++++----- 
.../kernels/funcs}/scatter.cu.h | 124 ++++++------ .../operators => phi/kernels/funcs}/scatter.h | 165 ++++++++-------- 32 files changed, 702 insertions(+), 627 deletions(-) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.cu.h (62%) rename paddle/{fluid/operators => phi/kernels/funcs}/gather.h (72%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.cu.h (67%) rename paddle/{fluid/operators => phi/kernels/funcs}/scatter.h (65%) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index b361bc3ab75e8..f170fbbe4b534 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -23,7 +23,6 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index ce9ac3de4e78c..860fdd01794cc 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -23,11 +23,11 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { namespace operators { @@ -160,9 +160,9 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); Tensor sorted_batch_id; sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); - GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, - &sorted_batch_id); + phi::funcs::GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); + phi::funcs::GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, + &sorted_batch_id); Tensor batch_index_t; int* batch_idx_in = @@ -190,7 +190,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, sizeof(int) * 8, dev_ctx.stream()); - GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); + phi::funcs::GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); Tensor length_lod; int* length_lod_data = diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index a60f881ebf3e3..e5ae9a6ccbda5 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -21,7 +21,6 @@ limitations under the License.*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -66,7 +65,8 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { auto multi_layer_scores = context.MultiInput("MultiLevelScores"); - auto multi_rois_num = 
context.MultiInput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiInput("MultiLevelRoIsNum"); int num_size = multi_rois_num.size(); auto* fpn_rois = context.Output("FpnRois"); @@ -176,7 +176,7 @@ class CollectFpnProposalsOpKernel : public framework::OpKernel { } num_per_batch.emplace_back(post_nms_topN - pre_idx); if (context.HasOutput("RoisNum")) { - auto* rois_num = context.Output("RoisNum"); + auto* rois_num = context.Output("RoisNum"); int* rois_num_data = rois_num->mutable_data({batch_size}, context.GetPlace()); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index c117fbd70f528..7ad25e003b491 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -24,9 +24,9 @@ namespace cub = hipcub; #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -193,7 +193,8 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { start = end; multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); - GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + phi::funcs::GPUGather(dev_ctx, *fpn_rois, sub_idx, + multi_fpn_rois[i]); } else { multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, dev_ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index 628cbcd761186..5479e08c2a5ef 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -28,10 +27,11 @@ namespace operators { const int kBoxDim = 4; -inline std::vector GetLodFromRoisNum(const Tensor* rois_num) { +inline std::vector GetLodFromRoisNum( + const framework::Tensor* rois_num) { std::vector rois_lod; auto* rois_num_data = rois_num->data(); - Tensor cpu_tensor; + framework::Tensor cpu_tensor; if (platform::is_gpu_place(rois_num->place())) { paddle::framework::TensorCopySync(*rois_num, platform::CPUPlace(), &cpu_tensor); @@ -93,7 +93,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector fpn_rois_lod; int fpn_rois_num; if (context.HasInput("RoisNum")) { - auto* rois_num = context.Input("RoisNum"); + auto* rois_num = context.Input("RoisNum"); fpn_rois_lod = GetLodFromRoisNum(rois_num); } else { fpn_rois_lod = fpn_rois->lod().back(); @@ -105,7 +105,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector num_rois_level(num_level, 0); std::vector num_rois_level_integral(num_level + 1, 0); for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { @@ -140,7 +140,7 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { std::vector restore_index_inter(fpn_rois_num, -1); // distribute the rois into different fpn level by target level for (size_t i = 0; i < fpn_rois_lod.size() - 1; ++i) { - Tensor fpn_rois_slice = + auto fpn_rois_slice = fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); const T* rois_data = fpn_rois_slice.data(); size_t cur_offset = fpn_rois_lod[i]; @@ -163,7 +163,8 @@ class DistributeFpnProposalsOpKernel : public framework::OpKernel { for (int i = 0; i < fpn_rois_num; ++i) { restore_index_data[restore_index_inter[i]] = i; } - auto multi_rois_num = context.MultiOutput("MultiLevelRoIsNum"); + auto multi_rois_num = + context.MultiOutput("MultiLevelRoIsNum"); if (multi_rois_num.size() > 0) { int batch_size = fpn_rois_lod.size() - 1; for (int i = 0; i < num_level; ++i) { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index e6af1a5bbf71c..c9cc4e722071c 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/mask_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 424aa0714400d..cbf17048400bf 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -16,8 +16,8 @@ limitations under the License. 
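The CPUGather/GPUGather call sites below keep their behaviour; only the namespace moves to phi::funcs. As a reminder of what the row-gather helpers compute, a scalar reference loop (illustrative only, not the Paddle implementation):

#include <algorithm>
#include <cstdint>

// out[i, :] = src[index[i], :] for a 2-D source of width `row_width`.
void GatherRowsReference(const float* src, const int* index, float* out,
                         int index_size, int row_width) {
  for (int i = 0; i < index_size; ++i) {
    const float* src_row = src + static_cast<int64_t>(index[i]) * row_width;
    std::copy(src_row, src_row + row_width,
              out + static_cast<int64_t>(i) * row_width);
  }
}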
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -281,22 +281,22 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, fg_inds_t, &fg_boxes); + phi::funcs::CPUGather(context, boxes, fg_inds_t, &fg_boxes); bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); - CPUGather(context, boxes, bg_inds_t, &bg_boxes); + phi::funcs::CPUGather(context, boxes, bg_inds_t, &bg_boxes); Concat(context, fg_boxes, bg_boxes, sampled_boxes); - CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + phi::funcs::CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); fg_labels.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + phi::funcs::CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); phi::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; fg_max_overlap.mutable_data({fg_num}, context.GetPlace()); - CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, fg_inds_t, &fg_max_overlap); bg_max_overlap.mutable_data({bg_num}, context.GetPlace()); - CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); + phi::funcs::CPUGather(context, max_overlap, bg_inds_t, &bg_max_overlap); Concat(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap); } @@ -334,7 +334,7 @@ std::vector SampleRoisForOneImage( } else { proposals_num = keep.numel(); roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - CPUGather(context, rpn_rois, keep, &roi_filter); + phi::funcs::CPUGather(context, rpn_rois, keep, &roi_filter); } T* roi_filter_dt = roi_filter.data(); memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 8c4bd4ac61320..d6130823271f0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -196,10 +196,10 @@ class GenerateProposalsKernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -223,8 +223,8 @@ class GenerateProposalsKernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -237,8 +237,8 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 6e3c322c17483..5fb7973fd89e4 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -85,8 +86,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -102,8 +103,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 6351ea865cd0e..1f1802574c5b8 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" -#include "paddle/fluid/operators/gather.h" +#include "paddle/phi/kernels/funcs/gather.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -197,10 +197,10 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); - CPUGather(ctx, scores_slice, index_t, &scores_sel); - CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); - CPUGather(ctx, anchors, index_t, &anchor_sel); - CPUGather(ctx, variances, index_t, &var_sel); + phi::funcs::CPUGather(ctx, scores_slice, index_t, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + phi::funcs::CPUGather(ctx, anchors, index_t, &anchor_sel); + phi::funcs::CPUGather(ctx, variances, index_t, &var_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); @@ -227,8 +227,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { Tensor scores_filter; bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, proposals, keep, &bbox_sel); - CPUGather(ctx, scores_sel, keep, &scores_filter); + phi::funcs::CPUGather(ctx, proposals, keep, &bbox_sel); + phi::funcs::CPUGather(ctx, scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -242,8 +242,8 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - CPUGather(ctx, bbox_sel, keep_nms, &proposals); - 
CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + phi::funcs::CPUGather(ctx, bbox_sel, keep_nms, &proposals); + phi::funcs::CPUGather(ctx, scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index 93ba3deca5fc4..005309e8ee577 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -86,8 +87,8 @@ static std::pair ProposalForOneImage( } proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); - GPUGather(ctx, proposals, keep_index, &proposals_filter); - GPUGather(ctx, scores_sort, keep_index, &scores_filter); + phi::funcs::GPUGather(ctx, proposals, keep_index, &proposals_filter); + phi::funcs::GPUGather(ctx, scores_sort, keep_index, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(proposals_filter, scores_filter); @@ -104,8 +105,8 @@ static std::pair ProposalForOneImage( Tensor scores_nms, proposals_nms; proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); - GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); - GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + phi::funcs::GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + phi::funcs::GPUGather(ctx, scores_filter, keep_nms, &scores_nms); return std::make_pair(proposals_nms, scores_nms); } diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu index 0de2798bf7509..338c441161834 100644 --- a/paddle/fluid/operators/gather_nd_op.cu +++ b/paddle/fluid/operators/gather_nd_op.cu @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
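The gather_nd and scatter hunks that follow also switch the index-type check from framework::proto::VarType to branching directly on the tensor's phi::DataType. A condensed sketch of that idiom (the explicit template arguments are assumed; the real kernels inline this logic together with the INT32/INT64 enforcement message):

#include <cstdint>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/gather.h"

// Hypothetical helper: dispatch on the index dtype, then call the moved
// phi::funcs primitive with the matching index type.
template <typename T>
void GatherNdByIndexDtype(const paddle::platform::CPUDeviceContext& dev_ctx,
                          const paddle::framework::Tensor& x,
                          const paddle::framework::Tensor& index,
                          paddle::framework::Tensor* out) {
  if (index.dtype() == phi::DataType::INT32) {
    phi::funcs::CPUGatherNd<T, int32_t>(dev_ctx, x, index, out);
  } else {
    phi::funcs::CPUGatherNd<T, int64_t>(dev_ctx, x, index, out);
  }
}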
*/ #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { -template +template class GatherNdOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -33,27 +33,25 @@ class GatherNdOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - GPUGatherNd(ctx, *x, *index, output); + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); } } }; -template +template class GatherNdGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -71,24 +69,22 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterNdAdd(ctx, *dO, *index, dX); + auto &dev_ctx = ctx.cuda_device_context(); + if 
(index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; @@ -98,18 +94,16 @@ class GatherNdGradOpCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h index f458c0e18013b..d54261008e47b 100644 --- a/paddle/fluid/operators/gather_nd_op.h +++ b/paddle/fluid/operators/gather_nd_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -38,22 +38,20 @@ class GatherNdOpKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGatherNd(ctx.device_context(), *x, *index, output); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); } } }; @@ -65,6 +63,7 @@ class GatherNdGradOpKernel : public framework::OpKernel { 
PADDLE_ENFORCE_EQ( platform::is_cpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); + auto *index = ctx.Input("Index"); auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); @@ -75,22 +74,21 @@ class GatherNdGradOpKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *dO, *index, dX); - } else if (index_type == framework::proto::VarType::INT64) { - ScatterNdAdd(ctx, *dO, *index, dX); + auto index_type = index->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s]", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); } } }; diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index a502a13040949..8f1d9284c5038 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -49,11 +49,14 @@ class GatherOpCUDAKernel : public framework::OpKernel { } const auto &place = ctx.GetPlace(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &dev_ctx = ctx.cuda_device_context(); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, ctx); + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -61,9 +64,9 @@ class GatherOpCUDAKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { - GPUGather(ctx.device_context(), *x, *index, output); + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -93,14 +96,15 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } } + const auto &dev_ctx = ctx.cuda_device_context(); const auto &index_type = framework::TransToProtoVarType(index->dtype()); if (axis != 0) { if (index_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - ctx.GetPlace(), ctx); + phi::funcs::GatherV2GradCUDAFunction(dO, index, axis, dX, + dev_ctx); } return; } @@ -112,11 +116,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } else if (index_type == framework::proto::VarType::INT64) { - GPUScatterAssign(ctx, *dO, *index, dX, - ctx.Attr("overwrite")); + phi::funcs::GPUScatterAssign(dev_ctx, *dO, *index, dX, + ctx.Attr("overwrite")); } } }; diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 016c2b398daaa..94de694b2f9bc 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -40,31 +40,32 @@ class GatherOpKernel : public framework::OpKernel { // get axis from tensor if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &place = ctx.GetPlace(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2Function(dev_ctx, x, index, axis, + output); } return; } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *x, *index, output); - } else if (index_type == framework::proto::VarType::INT64) { - CPUGather(ctx.device_context(), *x, *index, output); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::CPUGather(dev_ctx, *x, *index, output); } } }; @@ -84,44 +85,45 @@ class GatherGradientOpKernel : public framework::OpKernel { int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { const Tensor *axis_tensor = ctx.Input("Axis"); - const auto &axis_type = - framework::TransToProtoVarType(axis_tensor->dtype()); - if (axis_type == framework::proto::VarType::INT32) { + const auto &axis_type = axis_tensor->dtype(); + if (axis_type == phi::DataType::INT32) { axis = static_cast(axis_tensor->data()[0]); - } else if (axis_type == framework::proto::VarType::INT64) { + } else if (axis_type == phi::DataType::INT64) { axis = static_cast(axis_tensor->data()[0]); } } - const auto &index_type = framework::TransToProtoVarType(index->dtype()); + const auto &index_type = index->dtype(); + auto &dev_ctx = ctx.template device_context(); if (axis != 0) { - if (index_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); - } else if (index_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); + if (index_type == phi::DataType::INT32) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); + } else if (index_type == phi::DataType::INT64) { + phi::funcs::GatherV2GradFunction(dev_ctx, dO, index, axis, + dX); } return; } dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = 
*ctx.template device_context() - .eigen_device(); + auto &place = *dev_ctx.eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; bool overwrite = ctx.Attr("overwrite"); - if (index_type == framework::proto::VarType::INT32) { + if (index_type == phi::DataType::INT32) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } - } else if (index_type == framework::proto::VarType::INT64) { + } else if (index_type == phi::DataType::INT64) { if (overwrite) { - ScatterAssign(ctx.device_context(), *dO, *index, dX); + phi::funcs::ScatterAssign(dev_ctx, *dO, *index, dX); } else { - ScatterAssignAdd(ctx, *dO, *index, dX); + phi::funcs::ScatterAssignAdd(dev_ctx, *dO, *index, dX); } } } diff --git a/paddle/fluid/operators/gather_test.cc b/paddle/fluid/operators/gather_test.cc index 0f3dcdadcf897..c962dd065234f 100644 --- a/paddle/fluid/operators/gather_test.cc +++ b/paddle/fluid/operators/gather_test.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/kernels/funcs/gather.h" TEST(Gather, GatherData) { paddle::framework::Tensor* src = new paddle::framework::Tensor(); @@ -39,7 +39,7 @@ TEST(Gather, GatherData) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::CPUGather(ctx, *src, *index, output); + phi::funcs::CPUGather(ctx, *src, *index, output); delete cpu_place; cpu_place = NULL; for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index 8f3c6660f51c4..93e96694270a4 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index bb6d8756bd0a3..fbdcb99c02ab9 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
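Taken together with the gather_test.cc change above, a compact usage sketch of the relocated header looks like this (the explicit float/int template arguments and the shapes are illustrative):

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/kernels/funcs/gather.h"

void GatherUsageSketch() {
  paddle::platform::CPUPlace place;
  paddle::platform::CPUDeviceContext ctx(place);

  paddle::framework::Tensor src, index, output;
  float* src_data = src.mutable_data<float>({4, 4}, place);
  int* idx_data = index.mutable_data<int>({2}, place);
  output.mutable_data<float>({2, 4}, place);

  for (int i = 0; i < 16; ++i) src_data[i] = static_cast<float>(i);
  idx_data[0] = 1;
  idx_data[1] = 3;

  // Rows 1 and 3 of src are copied into output; the helper used to live
  // under paddle::operators and now sits in phi::funcs.
  phi::funcs::CPUGather<float>(ctx, src, index, &output);
}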
*/ #include -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -379,9 +379,9 @@ class SegmentPoolGradFunctor { SimpleDiv<<>>(mean_grad.data(), summed_ids->data(), len, dim); - GPUGather(context, mean_grad, segments, in_grad); + phi::funcs::GPUGather(context, mean_grad, segments, in_grad); } else if (pooltype == "SUM") { - GPUGather(context, out_grad, segments, in_grad); + phi::funcs::GPUGather(context, out_grad, segments, in_grad); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported segment pooling operation, Only MEAN, SUM, MAX, MIN " diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu index 6448f8cc4056d..2fe3fcb759d34 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ b/paddle/fluid/operators/scatter_nd_add_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_nd_add_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -33,22 +33,20 @@ class ScatterNdAddOpCUDAKernel : public framework::OpKernel { auto *Out = ctx.Output("Out"); framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -69,12 +67,13 @@ class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.cuda_device_context(); // Gradient by Gather - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + if 
(index_type == phi::DataType::INT32) { + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h index 2bdf9ec58a850..81c95fe55abaa 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ b/paddle/fluid/operators/scatter_nd_add_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -37,23 +37,21 @@ class ScatterNdAddOpKernel : public framework::OpKernel { // In place output: Out = X framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s], but " + "desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); - if (index_type == framework::proto::VarType::INT32) { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterNdAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); } } }; @@ -76,11 +74,12 @@ class ScatterNdAddGradientOpKernel : public framework::OpKernel { if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - if (index_type == framework::proto::VarType::INT32) { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + const auto &index_type = Ids->dtype(); + auto &dev_ctx = ctx.template device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index 549e30803b464..7755e376bc195 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
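The scatter_nd_add hunks above keep the helper's semantics: each row of Updates is added into Out at the position named by the matching row of Ids, and duplicate ids accumulate, which is why the gather_nd gradient can reuse the same primitive. A scalar reference loop for the simplest one-level-index case (illustrative only):

#include <cstdint>

// out[ids[i], :] += updates[i, :]; repeated ids sum their contributions.
void ScatterNdAddReference(const int64_t* ids, const float* updates,
                           float* out, int num_updates, int row_width) {
  for (int i = 0; i < num_updates; ++i) {
    float* out_row = out + ids[i] * row_width;
    const float* upd_row = updates + static_cast<int64_t>(i) * row_width;
    for (int k = 0; k < row_width; ++k) out_row[k] += upd_row[k];
  }
}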
*/ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter.cu.h" #include "paddle/fluid/operators/scatter_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" namespace paddle { namespace operators { @@ -35,23 +35,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel { framework::TensorCopy(*X, ctx.GetPlace(), Out); // use template class to support int32_t and int64_t - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op Index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } else { - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, + overwrite); } } }; @@ -68,36 +67,33 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + auto index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.cuda_device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == framework::proto::VarType::INT32) { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } else { - GPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git 
a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 69ab6c7135cd5..7733181a93fb6 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/gather.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { @@ -39,29 +39,27 @@ class ScatterOpKernel : public framework::OpKernel { // In place output: Out = X, Out[Ids] = Updates framework::TensorCopy(*X, ctx.GetPlace(), Out); // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + platform::errors::InvalidArgument( + "Index holds the wrong type, it holds [%s]," + "but desires to be [%s] or [%s].", + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (overwrite) { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); + phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); } } else { - if (index_type == framework::proto::VarType::INT32) { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + if (index_type == phi::DataType::INT32) { + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } else { - ScatterAssignAdd(ctx, *Updates, *Ids, Out); + phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); } } } @@ -79,36 +77,33 @@ class ScatterGradientOpKernel : public framework::OpKernel { auto *Ids = ctx.Input("Ids"); auto *dOut = ctx.Input(framework::GradVarName("Out")); - const auto &index_type = framework::TransToProtoVarType(Ids->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; + const auto &index_type = Ids->dtype(); + bool index_type_match = index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64; PADDLE_ENFORCE_EQ( index_type_match, true, platform::errors::InvalidArgument( "scatter_op index holds the wrong type, it holds [%s]," "but desires to be [%s] or [%s]", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); + index_type, phi::DataType::INT32, phi::DataType::INT64)); + auto &dev_ctx = ctx.template device_context(); if (dX) { framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if 
(index_type == framework::proto::VarType::INT32) { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } else { - CPUScatterGradForX(ctx.device_context(), *Ids, dX); + phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); } } if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == framework::proto::VarType::INT32) { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + if (index_type == phi::DataType::INT32) { + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } else { - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); } } } diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index 0a4cab5fac1ab..93f2d60e5f232 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" #include @@ -43,7 +43,7 @@ TEST(scatter, ScatterUpdate) { auto* cpu_place = new paddle::platform::CPUPlace(); paddle::platform::CPUDeviceContext ctx(*cpu_place); - paddle::operators::ScatterAssign(ctx, src, index, &output); + phi::funcs::ScatterAssign(ctx, src, index, &output); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output.data()[i], 0.0f); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu index 4e20844dc3275..e147e62a98354 100644 --- a/paddle/fluid/operators/segment_pool_op.cu +++ b/paddle/fluid/operators/segment_pool_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/segment_pool_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 2d4730635fd2a..25c12ab565a14 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -16,8 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h index 365381abc4683..2960b77d5ac0f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.h @@ -15,8 +15,7 @@ limitations under the License. 
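The scatter_op hunks above preserve the meaning of the overwrite attribute: ScatterAssign overwrites the selected destination rows, while ScatterAssignAdd accumulates into them, so duplicate ids sum, assuming the destination rows were initialized first. A reference loop showing the difference (illustrative only):

#include <cstdint>

// overwrite == true : out[ids[i], :]  = updates[i, :]
// overwrite == false: out[ids[i], :] += updates[i, :]
void ScatterReference(const int* ids, const float* updates, float* out,
                      int num_updates, int row_width, bool overwrite) {
  for (int i = 0; i < num_updates; ++i) {
    float* dst = out + static_cast<int64_t>(ids[i]) * row_width;
    const float* src = updates + static_cast<int64_t>(i) * row_width;
    for (int k = 0; k < row_width; ++k) {
      dst[k] = overwrite ? src[k] : dst[k] + src[k];
    }
  }
}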
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/scatter.h" +#include "paddle/phi/kernels/funcs/scatter.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu index 3c546dd8156a2..68628fb2748c4 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -11,8 +11,8 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_functor.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/gather.cu.h" #include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/phi/kernels/funcs/gather.cu.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -62,10 +62,11 @@ int64_t ComputeBlockSize(int64_t col) { template