From c52a664e86a53c77a3ee33400edb49de36d81f4e Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Mon, 7 Mar 2022 14:33:58 +0800 Subject: [PATCH 01/50] [Phi]Move elementwise_div grad/double grad Kernel to Phi (#40172) * move elementwise_div grad * change mutable_data to alloc * fix compile bugs --- .../new_executor/standalone_executor_test.cc | 2 +- .../elementwise/elementwise_div_op.cc | 36 --- .../elementwise/elementwise_div_op.cu | 96 -------- .../elementwise/elementwise_div_op.h | 211 ------------------ .../elementwise/elementwise_functor.h | 61 ----- .../elementwise/elementwise_op_function.h | 71 +----- .../test_elementwise_div_grad_grad.cc | 2 +- .../kernels/cpu/elementwise_grad_kernel.cc | 37 ++- paddle/phi/kernels/elementwise_grad_kernel.h | 21 ++ paddle/phi/kernels/funcs/broadcast_function.h | 20 ++ .../phi/kernels/funcs/elementwise_functor.h | 68 ++++++ .../phi/kernels/funcs/elementwise_grad_base.h | 27 +++ paddle/phi/kernels/gpu/elementwise_grad.h | 126 +++++++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 62 ++++- .../impl/elementwise_grad_kernel_impl.h | 156 +++++++++++++ paddle/phi/kernels/math_kernel.cc | 1 + paddle/phi/ops/compat/elementwise_sig.cc | 22 ++ 17 files changed, 547 insertions(+), 472 deletions(-) delete mode 100644 paddle/fluid/operators/elementwise/elementwise_div_op.cu diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2c3359ffa8e46..62d87b6917e40 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -54,7 +54,7 @@ USE_OP(slice_grad); USE_OP(lookup_table_grad); USE_OP(sqrt); USE_OP(elementwise_max); -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); USE_OP(sgd); USE_OP(squared_l2_norm); USE_OP(memcpy_h2d); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 38cd232e4d1d2..13fd9b81a8765 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -102,42 +102,6 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, ops::ElementwiseDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu deleted file mode 100644 index 9eb4b0352e533..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, out, y}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {dout, out, y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c58a7f36548a5..e9adb9abdb528 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -20,142 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template -void default_elementwise_sub(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseSubFunctor(), z); - } -} - -template -void default_elementwise_div(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseDivFunctor(), z); - } -} - -template -class ElementwiseDivKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); - auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); - auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); - phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); - } -}; - -template -struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } -}; - -template -struct DivGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template -struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * out / y; - } -}; - -template -struct DivGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex out_div_y_conj((out / y).real, - -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseDivGradKernel : public ElemwiseGradKernel { - public: - void 
Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); - } -}; - class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { } }; -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - // dY = Out * dX * ddY / Y - dX * ddX / Y - // dOut = - dX * ddY - // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can - // inplace ddx - Tensor tmp; - if (dOut) { - tmp = *dOut; - } else { - auto& dev_ctx = ctx.template device_context(); - tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - } - if (dY) { - // dX_div_Y = dX / Y; - Tensor dX_div_Y = tmp; - default_elementwise_div(ctx, dX, Y, &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
- - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); - default_elementwise_div(ctx, &tmp, Y, ddOut); - } - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 86f5be3071c2d..8e0bf78e9b7f9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -90,67 +90,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -template -struct DivGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - // dx = dout / y - // dy = - dout * out / y - phi::Array outs; - outs[0] = a / c; - outs[1] = -a * b / c; - return outs; - } -}; - -template -struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - Complex c_conj(c.real, -c.imag); - Complex out_div_c_conj((b / c).real, -(b / c).imag); - outs[0] = a / c_conj; - outs[1] = -a * out_div_c_conj; - return outs; - } -}; - -// Float div grad -template -struct DivGradXFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -// Complex div grad -template -struct DivGradXFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a / b_conj; - } -}; - -// Float mul and div -template -struct DivGradYFunctor { - inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { - return -a * b / c; - } -}; - -// Complex mul and div -template -struct DivGradYFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b, - const Complex c) const { - Complex out_div_c_conj((b / c).real, -(b / c).imag); - return -a * out_div_c_conj; - } -}; - // Fmax template struct FMaxFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 61862aa9f8740..80b07721f0b4d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -45,6 +45,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif @@ -145,17 +146,9 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - const framework::DDim &x_dim = x.dims(); - const framework::DDim &y_dim = y.dims(); const auto &dev_ctx = ctx.template device_context(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - phi::funcs::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } + phi::funcs::ElemwiseGradCompute( + dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } // It is a common implementation to compute binary calculation with the support @@ -1174,14 +1167,6 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } #if defined(__NVCC__) || defined(__HIPCC__) -template -void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, - framework::Tensor *src, framework::Tensor *dst) { - std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); - TensorReduceImpl>( - dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, - dev_ctx.stream()); -} template void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, @@ -1189,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dx, framework::Tensor *dy, Functor func) { - framework::Tensor tmp_dx; - framework::Tensor tmp_dy; - dx->mutable_data(place); - dy->mutable_data(place); - std::vector outs; - if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { - outs = {dx, dy}; - } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, dy}; - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - outs = {dx, &tmp_dy}; - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, &tmp_dy}; - } - - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); - - if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } + phi::GetGradXAndYOut(dev_ctx, place, axis, ins, *dout, dx, dy, + func); } template @@ -1227,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dxy, Functor func) { - framework::Tensor tmp_dxy; - dxy->mutable_data(place); - - std::vector outs; - if (dxy->dims() != dout->dims()) { - tmp_dxy.mutable_data(dout->dims(), place); - outs = {&tmp_dxy}; - } else { - outs = {dxy}; - } - - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); - if (dxy->dims() != dout->dims()) { - 
ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); - } + phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, + func); } #endif diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 9aa206efed8c0..7890d634e9941 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -28,7 +28,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); namespace paddle { namespace operators { diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index e48ee80595908..c9177f1c46eac 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -18,7 +18,6 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -108,6 +107,20 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + funcs::ElementwiseGradPreProcess(dout, dx); + phi::funcs::ElemwiseGradCompute, DivGradDY>( + dev_ctx, x, y, out, dout, axis, dx, dy, DivGradDX(), DivGradDY()); +} + } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -171,3 +184,25 @@ PD_REGISTER_KERNEL(subtract_double_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_grad, + CPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + CPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + double, + int, + int64_t, + paddle::platform::complex, + paddle::platform::complex) {} diff --git a/paddle/phi/kernels/elementwise_grad_kernel.h b/paddle/phi/kernels/elementwise_grad_kernel.h index a1b296e326f21..bcd5a98f07ee9 100644 --- a/paddle/phi/kernels/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/elementwise_grad_kernel.h @@ -64,4 +64,25 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, int axis, DenseTensor* ddout); +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy); + +template +void DivideDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dx, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dy, + DenseTensor* dout, + DenseTensor* ddout); } // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index aab31cfbd55b6..7634c2462738b 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -592,5 +592,25 @@ void ElementwiseCompute(const GPUContext &dev_ctx, #endif +template +void 
DefaultElementwiseOperator(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int axis = -1) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + dev_ctx.template Alloc(z); + if (x_dims.size() >= y_dims.size()) { + funcs::ElementwiseCompute(dev_ctx, x, y, axis, Functor(), z); + } else { + funcs::ElementwiseCompute( + dev_ctx, x, y, axis, InverseFunctor(), z); + } +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index c0a3985cd1713..5615a450b5c54 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" @@ -92,5 +93,72 @@ struct InverseDivideFunctor { inline HOSTDEVICE T operator()(const T a, const T b) const { return b / a; } }; +template +using ComplexType = phi::dtype::complex; + +template +struct DivGradXYFunctor { + inline HOSTDEVICE phi::Array operator()(const InT a, + const InT b, + const InT c) { + // dx = dout / y + // dy = - dout * out / y + phi::Array outs; + outs[0] = a / c; + outs[1] = -a * b / c; + return outs; + } +}; + +template +struct DivGradXYFunctor, ComplexType> { + inline HOSTDEVICE phi::Array, 2> operator()( + const ComplexType a, + const ComplexType b, + const ComplexType c) { + phi::Array, 2> outs; + ComplexType c_conj(c.real, -c.imag); + ComplexType out_div_c_conj((b / c).real, -(b / c).imag); + outs[0] = a / c_conj; + outs[1] = -a * out_div_c_conj; + return outs; + } +}; + +// Float div grad +template +struct DivGradXFunctor { + inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } +}; + +// ComplexType div grad +template +struct DivGradXFunctor> { + inline HOSTDEVICE ComplexType operator()(const ComplexType a, + const ComplexType b) const { + ComplexType b_conj(b.real, -b.imag); + return a / b_conj; + } +}; + +// Float mul and div +template +struct DivGradYFunctor { + inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { + return -a * b / c; + } +}; + +// ComplexType mul and div +template +struct DivGradYFunctor> { + inline HOSTDEVICE ComplexType operator()(const ComplexType a, + const ComplexType b, + const ComplexType c) const { + ComplexType out_div_c_conj((b / c).real, -(b / c).imag); + return -a * out_div_c_conj; + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index dff0cfe5b8b90..17bf873587381 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -24,6 +24,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -1758,5 +1759,31 @@ void ElemwiseGradComputeWithBroadcast(const GPUContext &ctx, #endif +template +void ElemwiseGradCompute(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + int axis, + DenseTensor *dx, + DenseTensor *dy, + DX_OP dx_op, + DY_OP dy_op) { + const DDim &x_dim = x.dims(); + const DDim &y_dim = y.dims(); + if (x.dims() == y.dims()) { + ElemwiseGradComputeNoBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } else { + ElemwiseGradComputeWithBroadcast( + dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); + } +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index 20799f4e37b3b..b356f19555fc4 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -14,12 +14,101 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/place.h" #include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_grad_base.h" #include "paddle/phi/kernels/funcs/reduce_function.h" namespace phi { +template +void ReduceWrapper(const GPUContext &dev_ctx, + int axis, + DenseTensor *src, + DenseTensor *dst) { + std::vector reduce_dims = + funcs::GetReduceDim(dst->dims(), src->dims(), axis); + funcs::TensorReduceImpl>( + dev_ctx, + *src, + dst, + kps::IdentityFunctor(), + reduce_dims, + dev_ctx.stream()); +} + +template +void GetGradXAndYOut(const GPUContext &dev_ctx, + const Place &place, + int axis, + std::vector ins, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + Functor func) { + DenseTensor tmp_dx; + DenseTensor tmp_dy; + dev_ctx.Alloc(dx); + dev_ctx.Alloc(dy); + std::vector outs; + if (dx->dims() == dout.dims() && dy->dims() == dout.dims()) { + outs = {dx, dy}; + } else if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) { + tmp_dx.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dx); + outs = {&tmp_dx, dy}; + } else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) { + tmp_dy.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dy); + outs = {dx, &tmp_dy}; + } else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) { + tmp_dy.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dy); + tmp_dx.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dx); + outs = {&tmp_dx, &tmp_dy}; + } + + funcs::BroadcastKernel( + dev_ctx, ins, &outs, axis, func); + + if (dx->dims() != dout.dims() && dy->dims() == dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); + } else if (dx->dims() == dout.dims() && dy->dims() != dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); + } else if (dx->dims() != dout.dims() && dy->dims() != dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); + ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); + } +} + +template +void GetGradXOrYOut(const GPUContext &dev_ctx, + const Place &place, + int axis, + std::vector ins, + const DenseTensor &dout, + DenseTensor *dxy, + Functor func) { + DenseTensor tmp_dxy; + dev_ctx.Alloc(dxy); + + std::vector outs; + if (dxy->dims() != dout.dims()) { + tmp_dxy.Resize(dout.dims()); + dev_ctx.Alloc(&tmp_dxy); + outs = {&tmp_dxy}; + } else { + outs = {dxy}; + } + + 
funcs::BroadcastKernel(dev_ctx, ins, &outs, axis, func); + if (dxy->dims() != dout.dims()) { + ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); + } +} + /* ****************************** Add Grad @@ -243,4 +332,41 @@ void elementwise_sub_grad(const GPUContext &ctx, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); } +/* +****************************** + Div Grad +****************************** +*/ +template +void ElementwiseDivGrad(const GPUContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + const DenseTensor &out, + const DenseTensor &dout, + DenseTensor *dx, + DenseTensor *dy, + int axis = -1) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); + } +} + } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index d00888aee6701..45c8b9a21639f 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -15,9 +15,11 @@ #include "paddle/phi/kernels/elementwise_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/gpu/elementwise_grad.h" #include "paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h" @@ -102,6 +104,38 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, phi::SubtractDoubleGradImpl(dev_ctx, y, ddx, ddy, dout, axis, ddout); } +template +void DivideGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx, + DenseTensor* dy) { + const auto place = dev_ctx.GetPlace(); + if (dx != nullptr && dy != nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXAndYOut( + dev_ctx, + place, + axis, + ins, + dout, + dx, + dy, + funcs::DivGradXYFunctor()); + } else if (dx != nullptr && dy == nullptr) { + std::vector ins = {&dout, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dx, funcs::DivGradXFunctor()); + } else if (dy != nullptr && dx == nullptr) { + std::vector ins = {&dout, &out, &y}; + GetGradXOrYOut( + dev_ctx, place, axis, ins, dout, dy, funcs::DivGradYFunctor()); + } +} + } // namespace phi PD_REGISTER_KERNEL(add_grad, @@ -168,3 +202,29 @@ PD_REGISTER_KERNEL(subtract_double_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_grad, + GPU, + ALL_LAYOUT, + phi::DivideGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} + +PD_REGISTER_KERNEL(divide_double_grad, + GPU, + ALL_LAYOUT, + phi::DivideDoubleGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16, + double, + int, + int64_t, + 
phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index ac7d6fd1a0e9c..e8831f90213b6 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -14,8 +14,11 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" namespace phi { @@ -103,4 +106,157 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, } } +/* +****************************** + Divide Grad +****************************** +*/ + +template +struct DivGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } +}; + +template +struct DivGradDX> { + HOSTDEVICE phi::dtype::complex operator()( + phi::dtype::complex x, + phi::dtype::complex y, + phi::dtype::complex out, + phi::dtype::complex dout) const { + phi::dtype::complex y_conj(y.real, -y.imag); + return dout / y_conj; + } +}; + +template +struct DivGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return -dout * out / y; + } +}; + +template +struct DivGradDY> { + HOSTDEVICE phi::dtype::complex operator()( + phi::dtype::complex x, + phi::dtype::complex y, + phi::dtype::complex out, + phi::dtype::complex dout) const { + phi::dtype::complex out_div_y_conj((out / y).real, -(out / y).imag); + return -dout * out_div_y_conj; + } +}; + +template +struct DivDoubleDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return y * out * dout - x * dout; + } +}; + +template +void DivideDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& y, + const DenseTensor& out, + const DenseTensor& dx, + paddle::optional ddx, + paddle::optional ddy, + int axis, + DenseTensor* dy, + DenseTensor* dout, + DenseTensor* ddout) { + if (dy) { + dy->Resize(y.dims()); + dev_ctx.template Alloc(dy); + } + if (dout) { + dout->Resize(out.dims()); + dev_ctx.template Alloc(dout); + } + if (ddout) { + ddout->Resize(out.dims()); + dev_ctx.template Alloc(ddout); + } + // ddX_safe == null ? 0 : ddX + // ddY_safe == null ? 0 : ddY + DenseTensor ddX_safe, ddY_safe; + phi::funcs::GetDoubleGradSafeTensor( + dev_ctx, dx, ddx.get_ptr(), &ddX_safe); + phi::funcs::GetDoubleGradSafeTensor( + dev_ctx, y, ddy.get_ptr(), &ddY_safe); + + // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y + // dY = Out * dX * ddY / Y - dX * ddX / Y + // dOut = - dX * ddY + // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can + // inplace ddx + DenseTensor tmp; + if (dout) { + tmp = *dout; + } else { + tmp.Resize(out.dims()); + dev_ctx.template Alloc(&tmp); + } + if (dy) { + // dX_div_Y = dX / Y; + DenseTensor dX_div_Y = tmp; + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, dx, y, &dX_div_Y, axis); + + // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the + // first output tensor is nullptr, the branch to calculate first + // output tensor will not be activated, DivGradDx function will not + // be called and can be ignored, the first branch has little effect + // on running speed. 
+ + // dY = Out * dX * ddY / Y - dX * ddX / Y + phi::funcs::ElemwiseGradCompute, DivDoubleDY>( + dev_ctx, + ddX_safe, + ddY_safe, + out, + dX_div_Y, + axis, + nullptr, + dy, + DivGradDX(), + DivDoubleDY()); + } + + if (ddout) { + // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, out, ddY_safe, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseSubtractFunctor>( + dev_ctx, ddX_safe, tmp, &tmp, axis); + funcs::DefaultElementwiseOperator, + funcs::InverseDivideFunctor>( + dev_ctx, tmp, y, ddout, axis); + } + + if (dout) { + // dOut = - dX * ddY + funcs::DefaultElementwiseOperator, + funcs::InverseMultiplyFunctor>( + dev_ctx, dx, ddY_safe, dout, axis); + auto& place = *dev_ctx.eigen_device(); + auto dout_result = phi::EigenVector::Flatten(*dout); + dout_result.device(place) = static_cast(-1) * dout_result; + } +} + } // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 8b17d8bd2506c..a5d3f51e5447f 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -208,6 +208,7 @@ PD_REGISTER_KERNEL(divide, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(multiply, diff --git a/paddle/phi/ops/compat/elementwise_sig.cc b/paddle/phi/ops/compat/elementwise_sig.cc index 89846ea0563bb..d4a25866907a0 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -106,6 +106,22 @@ KernelSignature ElementwiseSubDoubleGradOpArgumentMapping( "subtract_double_grad", {"Y", "DDX", "DDY", "DOut"}, {"axis"}, {"DDOut"}); } +KernelSignature ElementwiseDivGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("divide_grad", + {"X", "Y", "Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X"), GradVarName("Y")}); +} + +KernelSignature ElementwiseDivDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("divide_double_grad", + {"Y", "Out", "DX", "DDX", "DDY"}, + {"axis"}, + {GradVarName("Y"), "DOut", "DDOut"}); +} + } // namespace phi PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); @@ -117,6 +133,8 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad_grad, subtract_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad, divide_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad); PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping); @@ -136,3 +154,7 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, phi::ElementwiseSubGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad_grad, phi::ElementwiseSubDoubleGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad, + phi::ElementwiseDivGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(elementwise_div_grad_grad, + phi::ElementwiseDivDoubleGradOpArgumentMapping); From 6fd96a0400e5e618795ad20f8e85a2e975ea4194 Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 7 Mar 2022 15:41:27 +0800 Subject: [PATCH 02/50] Add mlir trt engine type. 
(#40197) * infrt add trt engine * update engine name --- .../backends/tensorrt/test_trt_engine.cc | 8 ++--- paddle/infrt/backends/tensorrt/trt_engine.cc | 26 ++++++++--------- paddle/infrt/backends/tensorrt/trt_engine.h | 11 +++++-- paddle/infrt/backends/tensorrt/trt_utils.h | 9 +++--- .../dialect/tensorrt/trt_dilaect_types.h | 29 +++++++++++++++++++ paddle/infrt/dialect/tensorrt/trt_op_base.td | 3 ++ paddle/infrt/dialect/tensorrt/trt_ops.cc | 25 ++++++++++++++++ paddle/infrt/dialect/tensorrt/trt_ops.h | 5 +++- 8 files changed, 91 insertions(+), 25 deletions(-) create mode 100644 paddle/infrt/dialect/tensorrt/trt_dilaect_types.h diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc index 54b7bc3e8af83..12cf14060e27c 100644 --- a/paddle/infrt/backends/tensorrt/test_trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -17,8 +17,8 @@ #include #include #include -#include "glog/logging.h" -#include "gtest/gtest.h" +#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -86,7 +86,7 @@ TrtUniquePtr ConstructNetwork( inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } TEST(trt, run_static) { - TRTEngine static_trt_engine(0); + TrtEngine static_trt_engine(0); auto net = ConstructNetwork( static_trt_engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); BuildOptions static_build_options; @@ -164,7 +164,7 @@ TEST(trt, run_static) { } TEST(trt, run_dynamic) { - TRTEngine engine(0); + TrtEngine engine(0); auto net = ConstructNetwork( engine.GetTrtBuilder(), nvinfer1::Dims4{-1, 3, -1, -1}, false); BuildOptions build_options; diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc index a204fe42b4508..232653e8c41f7 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.cc +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -17,7 +17,7 @@ #include #include -#include "glog/logging.h" +#include #include "paddle/phi/backends/dynload/tensorrt.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/ddim.h" @@ -40,26 +40,26 @@ static nvinfer1::IRuntime* createInferRuntime( phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); } -TRTEngine::TRTEngine(int device_id) : device_id_(device_id) { +TrtEngine::TrtEngine(int device_id) : device_id_(device_id) { FreshDeviceId(); logger_.reset(new TrtLogger()); builder_.reset(createInferBuilder(logger_->GetTrtLogger())); phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); } -nvinfer1::IBuilder* TRTEngine::GetTrtBuilder() { +nvinfer1::IBuilder* TrtEngine::GetTrtBuilder() { CHECK_NOTNULL(builder_); return builder_.get(); } -void TRTEngine::Build(TrtUniquePtr network, +void TrtEngine::Build(TrtUniquePtr network, const BuildOptions& build_options) { FreshDeviceId(); ModelToBuildEnv(std::move(network), build_options); CHECK_NOTNULL(engine_); } -bool TRTEngine::ModelToBuildEnv( +bool TrtEngine::ModelToBuildEnv( TrtUniquePtr network, const BuildOptions& build) { CHECK_NOTNULL(builder_); @@ -70,7 +70,7 @@ bool TRTEngine::ModelToBuildEnv( return true; } -bool TRTEngine::NetworkToEngine(const BuildOptions& build) { +bool TrtEngine::NetworkToEngine(const BuildOptions& build) { TrtUniquePtr config{builder_->createBuilderConfig()}; CHECK_NOTNULL(config); CHECK(SetupNetworkAndConfig(build, *network_, *config)); @@ -91,7 +91,7 @@ 
bool TRTEngine::NetworkToEngine(const BuildOptions& build) { return true; } -bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build, +bool TrtEngine::SetupNetworkAndConfig(const BuildOptions& build, INetworkDefinition& network, IBuilderConfig& config) { builder_->setMaxBatchSize(build.max_batch); @@ -235,7 +235,7 @@ bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build, return true; } -bool TRTEngine::SetUpInference( +bool TrtEngine::SetUpInference( const InferenceOptions& inference, const std::unordered_map& inputs, std::unordered_map* outputs) { @@ -261,7 +261,7 @@ bool TRTEngine::SetUpInference( return true; } -void TRTEngine::Run(const phi::GPUContext& ctx) { +void TrtEngine::Run(const phi::GPUContext& ctx) { if (is_dynamic_shape_) { DynamicRun(ctx); } else { @@ -269,7 +269,7 @@ void TRTEngine::Run(const phi::GPUContext& ctx) { } } -void TRTEngine::StaticRun(const phi::GPUContext& ctx) { +void TrtEngine::StaticRun(const phi::GPUContext& ctx) { const int num_bindings = engine_->getNbBindings(); std::vector buffers(num_bindings, nullptr); @@ -303,7 +303,7 @@ void TRTEngine::StaticRun(const phi::GPUContext& ctx) { runtime_batch, buffers.data(), ctx.stream(), nullptr); } -void TRTEngine::DynamicRun(const phi::GPUContext& ctx) { +void TrtEngine::DynamicRun(const phi::GPUContext& ctx) { const int num_bindings = engine_->getNbBindings(); std::vector buffers(num_bindings, nullptr); @@ -339,14 +339,14 @@ void TRTEngine::DynamicRun(const phi::GPUContext& ctx) { contexts_.front()->enqueueV2(buffers.data(), ctx.stream(), nullptr); } -void TRTEngine::FreshDeviceId() { +void TrtEngine::FreshDeviceId() { int count; cudaGetDeviceCount(&count); CHECK_LT(device_id_, count); phi::backends::gpu::SetDeviceId(device_id_); } -void TRTEngine::GetEngineInfo() { +void TrtEngine::GetEngineInfo() { #if IS_TRT_VERSION_GE(8200) LOG(INFO) << "====== engine info ======"; std::unique_ptr infer_inspector( diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h index f72bdaf3ac0b4..3c8243e3c3838 100644 --- a/paddle/infrt/backends/tensorrt/trt_engine.h +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -56,13 +56,18 @@ using namespace nvinfer1; // NOLINT // // We have encapsulated this logic, please use the following programming model. 
// -// TRTEngine trt_engine; +// TrtEngine trt_engine; // trt_engine.Build(...); // trt_engine.SetUpInference(...); // trt_engine.Run(...); -class TRTEngine { +class TrtEngine { public: - explicit TRTEngine(int device_id); + explicit TrtEngine(int device_id = 0); + + TrtEngine(const TrtEngine&) = delete; + TrtEngine& operator=(const TrtEngine&) = delete; + TrtEngine(TrtEngine&&) = default; + TrtEngine& operator=(TrtEngine&&) = default; nvinfer1::IBuilder* GetTrtBuilder(); diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h index 4b129af1d5381..c66a850ffb1cc 100644 --- a/paddle/infrt/backends/tensorrt/trt_utils.h +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -15,16 +15,17 @@ #pragma once +#include +#include +#include +#include + #include #include #include #include #include -#include -#include -#include -#include "glog/logging.h" #include "paddle/phi/core/dense_tensor.h" namespace infrt { diff --git a/paddle/infrt/dialect/tensorrt/trt_dilaect_types.h b/paddle/infrt/dialect/tensorrt/trt_dilaect_types.h new file mode 100644 index 0000000000000..efcf7dd5be195 --- /dev/null +++ b/paddle/infrt/dialect/tensorrt/trt_dilaect_types.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "mlir/IR/Types.h" + +namespace infrt { +namespace trt { + +class EngineType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +} // namespace trt +} // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_op_base.td b/paddle/infrt/dialect/tensorrt/trt_op_base.td index 5722f17d59787..128960ee03e03 100755 --- a/paddle/infrt/dialect/tensorrt/trt_op_base.td +++ b/paddle/infrt/dialect/tensorrt/trt_op_base.td @@ -27,6 +27,9 @@ class TRT_PaddleAttr : Attr()">, "PaddlePaddle " # description # " attribute">; +def TRT_EngineType : + Type()">, "!trt.engine">, + BuildableType<"getType<::infrt::trt::EngineType>()">; //===----------------------------------------------------------------------===// // PaddlePaddle type definitions diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.cc b/paddle/infrt/dialect/tensorrt/trt_ops.cc index 35b7967892caf..f179939e23206 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.cc +++ b/paddle/infrt/dialect/tensorrt/trt_ops.cc @@ -13,23 +13,48 @@ // limitations under the License. 
#include "paddle/infrt/dialect/tensorrt/trt_ops.h" +#include #include #include #include #include #include +#include "paddle/infrt/dialect/tensorrt/trt_dilaect_types.h" namespace infrt { namespace trt { TensorRTDialect::TensorRTDialect(mlir::MLIRContext *context) : mlir::Dialect("trt", context, mlir::TypeID::get()) { + addTypes(); addOperations< #define GET_OP_LIST #include "paddle/infrt/dialect/tensorrt/trt_ops.cpp.inc" // NOLINT >(); } +mlir::Type TensorRTDialect::parseType(mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return mlir::Type(); + // parse trt dilaect types, for example: !trt.engine + if (keyword == "engine") { + return infrt::trt::EngineType::get(getContext()); + } + parser.emitError(parser.getCurrentLocation(), "unknown infrt::trt type: ") + << keyword; + return mlir::Type(); +} + +void TensorRTDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const { + // print trt dilaect types, for example: !trt.engien + if (type.isa()) { + printer << "engine"; + return; + } + llvm_unreachable("unknown infrt::trt type."); +} + } // namespace trt } // namespace infrt diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.h b/paddle/infrt/dialect/tensorrt/trt_ops.h index 95b2ed41fdfe9..978b9906e5f52 100644 --- a/paddle/infrt/dialect/tensorrt/trt_ops.h +++ b/paddle/infrt/dialect/tensorrt/trt_ops.h @@ -35,8 +35,11 @@ namespace trt { class TensorRTDialect : public mlir::Dialect { public: - explicit TensorRTDialect(mlir::MLIRContext* context); + explicit TensorRTDialect(mlir::MLIRContext *context); static llvm::StringRef getDialectNamespace() { return "trt"; } + mlir::Type parseType(mlir::DialectAsmParser &parser) const; // NOLINT + void printType(mlir::Type type, + mlir::DialectAsmPrinter &printer) const; // NOLINT }; } // namespace trt From 7296433504eae988cadf198bd9e4aaccde73d8aa Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Mon, 7 Mar 2022 16:33:59 +0800 Subject: [PATCH 03/50] [phi] move is_empty to phi (#39919) * Add is_empty * fixed for CI * fixed code style * resolve conflict * deal with comments * replace pt by pd --- paddle/fluid/operators/is_empty_op.cc | 20 ++++----- paddle/fluid/operators/is_empty_op.cu.cc | 23 ---------- paddle/phi/infermeta/unary.cc | 6 +++ paddle/phi/infermeta/unary.h | 2 + paddle/phi/kernels/is_empty_kernel.cc | 53 ++++++++++++++++++++++++ paddle/phi/kernels/is_empty_kernel.h | 24 +++++++++++ 6 files changed, 92 insertions(+), 36 deletions(-) delete mode 100644 paddle/fluid/operators/is_empty_op.cu.cc create mode 100644 paddle/phi/kernels/is_empty_kernel.cc create mode 100644 paddle/phi/kernels/is_empty_kernel.h diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 2750367dc7739..c835bb3cf60bf 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty"); - ctx->SetOutputDim("Out", {1}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto *x = ctx.Input("X"); @@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor, + PD_INFER_META(phi::IsEmptyInferMeta)); REGISTER_OPERATOR( is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); + paddle::framework::EmptyGradOpMaker, + IsEmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc deleted file mode 100644 index 3c256503baf6b..0000000000000 --- a/paddle/fluid/operators/is_empty_op.cu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/is_empty_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 85db1547f16cc..b9eb5196b1e8f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include + #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/enforce.h" @@ -307,6 +308,11 @@ void InferMetaFromVecValue(const MetaTensor& x, } } +void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(phi::make_ddim({1})); + out->set_dtype(DataType::BOOL); +} + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index d4e21fbd8244b..37b17f6e3d182 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -69,6 +69,8 @@ void InferMetaFromVecValue(const MetaTensor& x, const std::vector& shape, MetaTensor* out); +void IsEmptyInferMeta(const MetaTensor& x, MetaTensor* out); + void MultinomialInferMeta(const MetaTensor& x, int num_samples, bool replacement, diff --git a/paddle/phi/kernels/is_empty_kernel.cc b/paddle/phi/kernels/is_empty_kernel.cc new file mode 100644 index 0000000000000..26c2f978005f2 --- /dev/null +++ b/paddle/phi/kernels/is_empty_kernel.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/is_empty_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void IsEmptyKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + // Note: is_empty is always executed on CPU and the output data should + // always be allocated for CPUPlace. We reigister CUDA kernel for this op to + // avoid the unnecessary data transform. + bool* out_data = dev_ctx.template HostAlloc(out); + out_data[0] = phi::product(x.dims()) == 0; +} + +} // namespace phi + +PD_REGISTER_KERNEL(is_empty, + CPU, + ALL_LAYOUT, + phi::IsEmptyKernel, + float, + double, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(is_empty, + GPU, + ALL_LAYOUT, + phi::IsEmptyKernel, + float, + double, + int, + int64_t) {} +#endif diff --git a/paddle/phi/kernels/is_empty_kernel.h b/paddle/phi/kernels/is_empty_kernel.h new file mode 100644 index 0000000000000..3bcf6f9054ed5 --- /dev/null +++ b/paddle/phi/kernels/is_empty_kernel.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void IsEmptyKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out);
+
+}  // namespace phi

From 2a3d9eca64b0312a6bf49ffe6f470a084886bbe4 Mon Sep 17 00:00:00 2001
From: Ming-Xu Huang
Date: Mon, 7 Mar 2022 16:38:21 +0800
Subject: [PATCH 04/50] cuBlasLt Epilogue To Fuse Linear + ReLU|GeLU (#39437)

* Added cuBlasLtHandle_t to device context.

* Added fused_gemm_epilogue op.

1. Added fused_gemm_epilogue op to leverage cuBlasLt Epilogue.

2. Support fusing Act(X*Y + bias), where X's dims >= 2 and Y's dims should
   be 2.

3. Act currently only supports ReLU (GeLU will be added in the future).

* Added UT to fused_gemm_epilogue op.

* Added LinearAct Pattern

1. Added LinearAct into graph_pattern_detector.* to define (2.)'s pattern.

2. LinearAct is used to detect act(elementwise_add(matmul_v2(x, w), bias)).

3. act currently only supports ReLU (GeLU will be supported in the future).

* Added FuseGemmEpiloguePass

1. Added FuseGemmEpiloguePass to handle nn.Linear + Act{ReLU} fusion (GeLU
   will be supported in the future).

2. Only matmul_v2 from nn.Linear is supported.

* Added pybind to BuildStrategy.fuse_gemm_epilogue_.

* Added UT for fuse_gemm_epilogue_pass.

* GeLU support and EpilogueSingleton

1. Added GeLU support to fused_gemm_epilogue op.

2. Added EpilogueSingleton to cache the auxiliary pointer.

3. Added related UTs.

* Rename cublaslt_epilogue_op to gemm_epilogue_op.*.

* Added both train and infer patterns to LinearAct.

1. Added support of fwd graphs with grad_ops linking to LinearAct.

2. Added related changes to fuse_gemm_epilogue_pass for the above
   modification.

* Changed CUDA requirement from 11.4 to 11.6 for fuse_gemm_epilogue_pass.

* Added identity activation support to gemm_epilogue_op.

* Added Linear Fusion (matmul_v2 + ele_add)

1. Added matmul_v2 + ele_add pattern to LinearActPattern.

2. Added matmul_v2 + ele_add support to fuse_gemm_epilogue_pass.

* Rename gemm_epilogue_op.* to fused_gemm_epilogue_op.*

* Add fused_gemm_epilogue_grad op.

1. Added fused_gemm_epilogue_grad to support backward epilogue fusion.

* Add UTs to fused_gemm_epilogue_grad_op.

* Change attribute name in fused_gemm_epilogue_grad_op for clarity.

* Allow DX and DBias to be dispensable in the fused_gemm_epilogue_grad op.

* Added ElementwiseAdd+Matmul+Act graph pattern detection.

* Fuse backward of Linear(Act(x))

1. Added backward fusion pass for Linear(Act(x)).

2. Added backward fusion pass for Linear(x).

* Added UTs for the backward fusion of Linear(Act(x)).

* Completed documentation of the arguments of fused_gemm_epilogue_op.

* Made arguments of some functions be passed by reference.

* Modify code with review comments.

1. Made arguments of some functions be passed by reference.

2. Removed redundant code.

3. Followed Google code style to change code.

* Made 'const' code style consistent.

* Fixed random seed of Python UTs.

* Set compiling constraints for cuBlasLt

1. Require CUDA 11.6+.

2. Remove fuse_gemm_epilogue related tests when CUDA < 11.6.

* Code Review from Paddle

1. Changed the argument name is_first_gemm to without_x_gradient for
   clarity.

2. Applied PADDLE_THROW in fused_gemm_epilogue_op.

* Remove EpilogueSingleton

1. Applied ReserveSpace to replace Epilogue for passing auxiliary pointers
   between FWD and BWD.

* Fix a logical error and enhance UTs.

1. Added act op count checking in UTs.

2. Fixed an issue fusing the backward of ReLU(Linear(X)).

3. TODO: solve GELU fusion issues.

* Fix Linear and GeLU fusion issues.

1. Modified graph_pattern_detector to fit linear with gelu or relu.

2. Modified data range in UTs to allow negative values.

* Removed fused_gemm_epilogue_op.h.

* Rename namespace pten to phi.

* Rename name of arguments in fused_gemm_epilogue_op

1. bias -> Bias.

2. out -> Out.

3. reserve_space -> ReserveSpace.

* Change EpiloguePassActivationCache to a local variable.

1. Removed singleton in EpiloguePassActivationCache.

2. Made EpiloguePassActivationCache an argument to each pass function.
---
 cmake/operators.cmake                         |  10 +-
 paddle/fluid/framework/details/CMakeLists.txt |   2 +-
 .../fluid/framework/details/build_strategy.cc |   9 +
 .../fluid/framework/details/build_strategy.h  |   3 +
 paddle/fluid/framework/ir/CMakeLists.txt      |   1 +
 .../framework/ir/fuse_gemm_epilogue_pass.cc   | 471 ++++++++++++++++++
 .../framework/ir/fuse_gemm_epilogue_pass.h    | 100 ++++
 .../framework/ir/graph_pattern_detector.cc    | 178 ++++++-
 .../framework/ir/graph_pattern_detector.h     |  59 +++
 paddle/fluid/operators/fused/CMakeLists.txt   |   7 +-
 .../operators/fused/fused_gemm_epilogue_op.cc | 353 +++++++++++++
 .../operators/fused/fused_gemm_epilogue_op.cu | 376 ++++++++++++++
 .../platform/device/gpu/cuda/cuda_helper.h    |  24 +
 paddle/fluid/platform/device/gpu/gpu_types.h  |   6 +
 paddle/fluid/platform/device_context.cc       |  22 +
 paddle/fluid/platform/device_context.h        |  31 ++
 paddle/fluid/pybind/pybind.cc                 |  26 +
 paddle/phi/backends/gpu/forwards.h            |   4 +
 paddle/phi/backends/gpu/gpu_context.cc        |  32 ++
 paddle/phi/backends/gpu/gpu_context.h         |   6 +
 paddle/phi/backends/gpu/gpu_decls.h           |   5 +
 .../fluid/tests/unittests/CMakeLists.txt      |  11 +
 .../unittests/test_fuse_gemm_epilogue_pass.py | 392 +++++++++++++++
 .../test_fused_gemm_epilogue_grad_op.py       | 239 +++++++++
 .../unittests/test_fused_gemm_epilogue_op.py  | 450 +++++++++++++++++
 tools/static_mode_white_list.py               |   3 +
 26 files changed, 2788 insertions(+), 32 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc
 create mode 100644 paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h
 create mode 100644 paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
 create mode 100644 paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
 create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 7affd59de162d..9e8c81c2985b7 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -293,11 +293,11 @@ function(op_library TARGET)
   # Define operators that don't need pybind here.
   foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op"
 "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")
-
-  if ("${TARGET}" STREQUAL "${manual_pybind_op}")
-    set(pybind_flag 1)
-  endif()
-  endforeach()
+
+    if ("${TARGET}" STREQUAL "${manual_pybind_op}")
+      set(pybind_flag 1)
+    endif()
+  endforeach()

   # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
   # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
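Taken together, this commit introduces an op whose forward semantics are
Out = Act(X * Y + Bias), with Act one of {none, relu, gelu}. A minimal C++
sketch of that reference computation, assuming plain row-major 2D buffers
(the names and the erf-based GeLU below are illustrative assumptions, not
code from this patch; the cuBLASLt epilogue may realize GeLU with a tanh
approximation):

// Reference semantics of fused_gemm_epilogue (illustrative sketch only).
#include <cmath>
#include <vector>

enum class Act { kNone, kReLU, kGeLU };

// out[m][n] = act(sum_k x[m][k] * y[k][n] + bias[n])
std::vector<float> GemmEpilogueRef(const std::vector<float>& x,     // M x K
                                   const std::vector<float>& y,     // K x N
                                   const std::vector<float>& bias,  // N
                                   int M, int K, int N, Act act) {
  std::vector<float> out(static_cast<size_t>(M) * N);
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = bias[n];
      for (int k = 0; k < K; ++k) acc += x[m * K + k] * y[k * N + n];
      if (act == Act::kReLU) {
        acc = acc > 0.f ? acc : 0.f;
      } else if (act == Act::kGeLU) {
        acc = 0.5f * acc * (1.f + std::erf(acc * 0.70710678f));  // x/sqrt(2)
      }
      out[static_cast<size_t>(m) * N + n] = acc;
    }
  }
  return out;
}

The point of the fusion is that the pass below replaces the
matmul_v2 + elementwise_add (+ act) subgraph with a single op that performs
this whole computation in one cuBLASLt call, folding the bias add and
activation into the GEMM epilogue instead of separate elementwise kernels.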
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 66dfb81755f1c..948eaab40b4f6 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index c99200ec98aa8..fdf74d2f769fc 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + AppendPassWithCheck(strategy_.fuse_gemm_epilogue_, + "fuse_gemm_epilogue_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary. @@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass); !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) +USE_PASS(fuse_gemm_epilogue_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 70a083dd70bc3..5eb584aaefa98 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -1,4 +1,5 @@ // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -124,6 +125,8 @@ struct BuildStrategy { paddle::optional fuse_broadcast_ops_{paddle::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // Fuse GEMM+Epilogue via cublasLt epilogue. + bool fuse_gemm_epilogue_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. 
It is null in default, means diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0d53a54ff822a..a1f2d6edca6a2 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -157,6 +157,7 @@ endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc new file mode 100644 index 0000000000000..f48224cbdc24f --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
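+
+// This pass rewrites the forward subgraph matmul_v2 -> elementwise_add
+// [-> relu/gelu] produced by paddle.nn.Linear (+ activation), and its
+// backward counterpart, into fused_gemm_epilogue / fused_gemm_epilogue_grad
+// ops, so that the bias add and activation execute inside the cuBLASLt GEMM
+// epilogue.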
+ +#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { + EpiloguePassActivationCache cache; + + graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache); + graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache); + graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache); + graph = FuseLinearFwd(graph, false); + graph = FuseLinearFwd(graph, true); + graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache); + graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache); + graph = FuseLinearBwd(graph, false); + graph = FuseLinearBwd(graph, true); +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, + bool is_training) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, {}, is_training, false); + + int found_linear_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify whether matmul_v2
+    // is created by paddle.nn.Linear.
+    auto matmul_op_desc = matmul_op->Op();
+    if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc))
+      return;
+
+    OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block());
+    std::string activation = "none";
+    fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue");
+    fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()});
+    fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()});
+    fused_gemm_epilogue_op_desc.SetAttr("activation", activation);
+    fused_gemm_epilogue_op_desc.SetAttr("op_role",
+                                        matmul_op_desc->GetAttr("op_role"));
+    auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc);
+
+    IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node);
+    IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node);
+    IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node);
+    IR_NODE_LINK_TO(gemm_epilogue_node, ele_out);
+
+    GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op});
+
+    VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name()
+            << " -> " << matmul_op->Name() << " -> " << matmul_out->Name()
+            << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name()
+            << " -> " << ele_add_op->Name() << " -> " << ele_out->Name()
+            << "\n\t " << ele_out->Name();
+    found_linear_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_linear_count);
+  return graph;
+}
+
+ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd(
+    ir::Graph *graph, const std::unordered_set<std::string> &act_types,
+    bool is_training, bool is_act_grad_x_from_act,
+    EpiloguePassActivationCache *cache) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
+
+  const std::string scope_name("gemm_epilogue");
+  FusePassBase::Init(scope_name, graph);
+
+  GraphPatternDetector gpd;
+  auto *x = gpd.mutable_pattern()
+                ->NewNode(patterns::PDNodeName(scope_name, "x"))
+                ->AsInput()
+                ->assert_is_op_input("matmul_v2", "X");
+  patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act");
+
+  linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act);
+
+  int found_linear_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle LinearAct fuse";
+
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern);
+
+    std::vector<int64_t> matmul_x_shape = subgraph.at(x)->Var()->GetShape();
+    std::vector<int64_t> matmul_w_shape = matmul_w->Var()->GetShape();
+
+    // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear
+    // currently.
The conditions below are used to verify whether matmul_v2
+    // is created by paddle.nn.Linear.
+    auto matmul_op_desc = matmul_op->Op();
+    if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc))
+      return;
+
+    auto activation = act_op->Op()->Type();
+
+    OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block());
+    fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue");
+    fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()});
+    fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()});
+    fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()});
+    fused_gemm_epilogue_op_desc.SetAttr("activation", activation);
+    fused_gemm_epilogue_op_desc.SetAttr("op_role",
+                                        matmul_op_desc->GetAttr("op_role"));
+
+    auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc);
+
+    IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node);
+    IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node);
+    IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node);
+    IR_NODE_LINK_TO(gemm_epilogue_node, act_out);
+
+    // We only need to check weight.shape[1] for the auxiliary pointer,
+    // and we mark the act op as fused for backward epilogue fusion here.
+    // That is because of cuBLASLt epilogue's restrictions.
+    if (is_training) {
+      int divisor_of_n = activation == "relu" ? 128 : 8;
+      if (matmul_w_shape[1] % divisor_of_n) return;
+
+      VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace"));
+      auto *reserve_space_node = g->CreateVarNode(&reserve_space);
+
+      cache->InsertFusedActivation(
+          GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()),
+          reserve_space_node);
+
+      gemm_epilogue_node->Op()->SetOutput("ReserveSpace",
+                                          {reserve_space_node->Name()});
+
+      if (!is_act_grad_x_from_act) {
+        GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern);
+        act_grad_op->Op()->RenameInput(ele_out->Name(),
+                                       reserve_space_node->Name());
+        IR_NODE_LINK_TO(reserve_space_node, act_grad_op);
+      }
+      IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node);
+    }
+
+    GraphSafeRemoveNodes(g,
+                         {matmul_op, matmul_out, ele_add_op, ele_out, act_op});
+
+    VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name()
+            << " -> " << matmul_op->Name() << " -> " << matmul_out->Name()
+            << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name()
+            << " -> " << ele_add_op->Name() << " -> " << ele_out->Name()
+            << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> "
+            << act_out->Name();
+    found_linear_act_count++;
+  };
+
+  gpd(graph, handler);
+
+  AddStatis(found_linear_act_count);
+  return graph;
+}
+
+ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph,
+                                               bool without_x_gradient) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
+  const std::string scope_name("gemm_epilogue");
+  FusePassBase::Init(scope_name, graph);
+
+  GraphPatternDetector gpd;
+  auto *dout =
+      gpd.mutable_pattern()
+          ->NewNode(patterns::PDNodeName(scope_name, "dout"))
+          ->AsInput()
+          ->assert_is_op_input("elementwise_add_grad", GradVarName("Out"));
+
+  patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern(
+      gpd.mutable_pattern(), "ele_add_matmul_act");
+  ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false);
+
+  int found_ele_add_matmul_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle ElewiseAddMatmulAct fuse";
+
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad,
ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + + Node *matmul_grad_dx = nullptr; + if (!without_x_gradient) { + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx, + ele_add_matmul_act_pattern); + matmul_grad_dx = matmul_grad_dx_ptr; + } + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + std::string activation_grad = "none"; + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + if (matmul_grad_dx) { + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", + {matmul_grad_dx->Name()}); + } + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + if (matmul_grad_dx) { + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx); + } + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op}); + + std::string matmul_grad_dx_name = + matmul_grad_dx != nullptr ? 
matmul_grad_dx->Name() : " "; + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_w->Name() << " and " << matmul_grad_dx_name; + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, act_grad_types, false, + is_act_grad_x_from_act); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx, + ele_add_matmul_act_pattern); + + auto key = + GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId()); + if (!cache->HasFusedActivation(key)) { + return; + } + auto *reserve_space_node = cache->GetFusedActivationSpace(key); + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + auto activation_grad = act_grad_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace", + {reserve_space_node->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node); + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op, + matmul_grad_dx, act_grad_op}); + + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name() + << "\n\t " << matmul_grad_dx->Name() << " -> " + << act_grad_op->Name() << " -> " << act_grad_dx->Name(); + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +bool FuseGemmEpiloguePass::IsGemmFromLinear_( + const std::vector &x_shape, const std::vector &w_shape, + OpDesc *matmul_v2_op) const { + if (w_shape.size() != 2 || x_shape.size() < 2) return false; + for (auto attr_name : + {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y", + "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) { + if (matmul_v2_op->HasAttr(attr_name)) { + std::vector tmp_vec = + BOOST_GET_CONST(std::vector, matmul_v2_op->GetAttr(attr_name)); + if (tmp_vec.size() > 0) return false; + } + } + if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || + BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) + return false; + + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_gemm_epilogue_pass, + paddle::framework::ir::FuseGemmEpiloguePass); diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h new file mode 100644 index 0000000000000..575ffee73d60e --- /dev/null +++ 
b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <mutex>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse Linear (matmul_v2 + elementwise_add) and activation via the
+ * cuBLASLt epilogue.
+ */
+class Graph;
+class Node;
+
+class EpiloguePassActivationCache {
+ public:
+  EpiloguePassActivationCache() {}
+
+  EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete;
+  void operator=(const EpiloguePassActivationCache &) = delete;
+
+  bool HasFusedActivation(const std::string &key) const {
+    return fused_activation_space_map_.count(key);
+  }
+
+  ir::Node *GetFusedActivationSpace(const std::string &key) {
+    if (HasFusedActivation(key)) {
+      return fused_activation_space_map_.find(key)->second;
+    }
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The key (%s) of EpiloguePassActivationCache does not exist.", key));
+  }
+
+  void InsertFusedActivation(const std::string &key, ir::Node *const value) {
+    if (!HasFusedActivation(key)) {
+      mtx.lock();
+      fused_activation_space_map_.insert({key, value});
+      mtx.unlock();
+    } else {
+      PADDLE_THROW(platform::errors::AlreadyExists(
+          "The key (%s) of EpiloguePassActivationCache already exists.", key));
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, ir::Node *> fused_activation_space_map_;
+  std::mutex mtx;
+};
+
+class FuseGemmEpiloguePass : public FusePassBase {
+ public:
+  virtual ~FuseGemmEpiloguePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+  ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const;
+  ir::Graph *FuseLinearActFwd(ir::Graph *graph,
+                              const std::unordered_set<std::string> &act_types,
+                              bool is_training, bool is_act_grad_x_from_act,
+                              EpiloguePassActivationCache *cache) const;
+  ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const;
+  ir::Graph *FuseLinearActBwd(
+      ir::Graph *graph, const std::unordered_set<std::string> &act_grad_types,
+      bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const;
+
+ private:
+  bool IsGemmFromLinear_(const std::vector<int64_t> &x_shape,
+                         const std::vector<int64_t> &w_shape,
+                         OpDesc *matmul_v2_op) const;
+  const std::string GetReserveSpaceCacheKey(const std::string var_name,
+                                            int block_id) const {
+    return std::to_string(block_id) + var_name;
+  }
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index e4c9dc72128f4..d7d866fa98bb5 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1461,31 +1461,6 @@ PDNode
*patterns::BatchNormAddActGrad::operator()( return bn_grad; } -PDNode *patterns::ElewiseAddAct::operator()( - paddle::framework::ir::PDNode *ele_x_var, - std::unordered_set act_types) { - auto *ele_y_var = pattern->NewNode(ele_y_repr()) - ->assert_is_op_input("elementwise_add", "Y"); - - auto *ele_add = - pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); - - auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) - ->assert_is_op_output("elementwise_add", "Out"); - - ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); - - auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); - - auto *act_out_var = - pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); - - ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); - act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); - - return act_out_var; -} - PDNode *patterns::ElewiseAddActInplaceGrad::operator()( paddle::framework::ir::PDNode *d_act_out_var, std::unordered_set act_types) { @@ -1526,6 +1501,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::LinearAct::operator()( + paddle::framework::ir::PDNode *linear_x_var, + const std::unordered_set &act_types, bool with_grad_link, + bool is_act_grad_x_from_act) { + auto *matmul_w_var = + pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + + auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2"); + + auto *matmul_out_var = pattern->NewNode(matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X"); + + auto *ele_bias_var = pattern->NewNode(ele_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var}); + ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var}); + + if (with_grad_link) { + matmul_out_var->assert_is_op_input("elementwise_add_grad", "X"); + auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad") + ->assert_is_op("elementwise_add_grad"); + elementwise_add_grad_op->LinksFrom({matmul_out_var}); + } + + if (act_types.size() > 0) { + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_output(act_types, "Out"); + + 
act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + if (with_grad_link && !is_act_grad_x_from_act) { + std::unordered_set act_grad_types; + for (const auto &act : act_types) { + std::string act_grad(act); + act_grad.append("_grad"); + act_grad_types.insert(act_grad); + } + + ele_out_var->assert_is_ops_input(act_grad_types, "X"); + auto *act_grad_op = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + act_grad_op->LinksFrom({ele_out_var}); + } + + return act_out_var; + } + + return ele_out_var; +} + +PDNode *patterns::ElewiseAddMatmulAct::operator()( + paddle::framework::ir::PDNode *dout_var, + const std::unordered_set &act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act) { + auto *ele_grad_bias_var = + pattern->NewNode(ele_grad_bias_repr()) + ->assert_is_op_input("elementwise_add_grad", "Y"); + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *ele_grad_dx_var = + pattern->NewNode(ele_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + auto *ele_grad_dbias_var = + pattern->NewNode(ele_grad_dbias_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var}) + .LinksTo({ele_grad_dx_var, ele_grad_dbias_var}); + + ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + + auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *matmul_grad = + pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad"); + auto *matmul_grad_dx_var = + pattern->NewNode(matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *matmul_grad_dw_var = + pattern->NewNode(matmul_grad_dw_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + matmul_grad->LinksFrom( + {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var}); + if (without_x_gradient) { + matmul_grad->LinksTo({matmul_grad_dw_var}); + } else { + matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var}); + } + + if (!without_x_gradient && act_grad_types.size() > 0) { + matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input( + act_grad_types, GradVarName("Out")); + + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *act_grad_dx_var = + pattern->NewNode(act_grad_dx_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")); + + auto *act_grad_x_var = matmul_grad_x_var; + if (!is_act_grad_x_from_act) { + auto *ele_out_var = pattern->NewNode(ele_out_repr()) + ->assert_is_ops_input(act_grad_types, "X"); + act_grad_x_var = ele_out_var; + } + + act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var}) + .LinksTo({act_grad_dx_var}); + return act_grad; + } + + return matmul_grad; +} + // conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, std::string conv_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d6400ed6945bf..0f21906d08d0e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -863,6 +863,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(ele_y); }; +// The following patterns 
are used to fuse linear and act (ReLU or GeLU)
+// formula: act(F.linear(x))
+// op: matmul_v2 + elementwise_add + act
+// named nodes: matmul, elementwise_add, act
+//              matmul_w, matmul_out
+//              ele_bias, elewise_add_out, act_out
+struct LinearAct : public PatternBase {
+  LinearAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "linear_act") {}
+
+  PDNode* operator()(PDNode* x,
+                     const std::unordered_set<std::string>& act_types,
+                     bool with_grad_link, bool is_act_grad_x_from_act);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(matmul);
+  PATTERN_DECL_NODE(ele_add);
+  PATTERN_DECL_NODE(act);
+  PATTERN_DECL_NODE(act_grad);
+  // declare variable node's name
+  PATTERN_DECL_NODE(matmul_w);
+  PATTERN_DECL_NODE(matmul_out);
+  PATTERN_DECL_NODE(elewise_add_out);
+  PATTERN_DECL_NODE(ele_bias);
+  PATTERN_DECL_NODE(act_out);
+};
+
+// The following patterns are used to fuse linear_grad and act_grad (ReLU or
+// GeLU)
+// formula: the backward of F.linear( act(x) )
+// op: elementwise_add_grad + matmul_v2_grad + act_grad
+// named nodes: ele_add_grad, matmul_grad, act_grad
+//              ele_grad_bias, ele_grad_dx, ele_grad_dbias
+//              matmul_grad_x, matmul_grad_w, matmul_grad_dx
+//              matmul_grad_dw, act_grad_dx
+struct ElewiseAddMatmulAct : public PatternBase {
+  ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {}
+
+  PDNode* operator()(PDNode* x,
+                     const std::unordered_set<std::string>& act_grad_types,
+                     bool without_x_gradient, bool is_act_grad_x_from_act);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(ele_add_grad);
+  PATTERN_DECL_NODE(matmul_grad);
+  PATTERN_DECL_NODE(act_grad);
+  // declare variable node's name
+  PATTERN_DECL_NODE(ele_out);
+  PATTERN_DECL_NODE(ele_grad_bias);
+  PATTERN_DECL_NODE(ele_grad_dx);
+  PATTERN_DECL_NODE(ele_grad_dbias);
+  PATTERN_DECL_NODE(matmul_grad_x);
+  PATTERN_DECL_NODE(matmul_grad_w);
+  PATTERN_DECL_NODE(matmul_grad_dx);
+  PATTERN_DECL_NODE(matmul_grad_dw);
+  PATTERN_DECL_NODE(act_grad_dx);
+};
+
 // Conv with Elementwise_add as bias
 // op: conv + elementwise_add
 // named nodes:
diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt
index 67287afa6ae50..80e7f5c001d4b 100644
--- a/paddle/fluid/operators/fused/CMakeLists.txt
+++ b/paddle/fluid/operators/fused/CMakeLists.txt
@@ -19,7 +19,8 @@ register_operators(EXCLUDES
 	fused_attention_op
 	fused_transformer_op
 	fused_feedforward_op
-	resnet_unit_op)
+	resnet_unit_op
+	fused_gemm_epilogue_op)

 # fusion_gru_op does not have CUDA kernel
 op_library(fusion_gru_op)
@@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM)
     cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory)
     cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory)
 endif()
+
+  if (CUDA_VERSION GREATER_EQUAL 11.6)
+    op_library(fused_gemm_epilogue_op)
+  endif()
 endif()
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
new file mode 100644
index 0000000000000..4c4e3661e6d6e
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc
@@ -0,0 +1,353 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+class FusedGemmEpilogueOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp");
+    OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias",
+                   "FusedGemmEpilogueOp");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FusedGemmEpilogueOp");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto bias_dims = ctx->GetInputDim("Bias");
+
+    auto trans_x = ctx->Attrs().Get<bool>("trans_x");
+    auto trans_y = ctx->Attrs().Get<bool>("trans_y");
+
+    PADDLE_ENFORCE_EQ(
+        y_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor Y's dimension of FusedGemmEpilogueOp "
+            " should be 2, but got %d.",
+            y_dims.size()));
+
+    PADDLE_ENFORCE_GE(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor X's dimension of FusedGemmEpilogueOp "
+            " should be >= 2, but got %d.",
+            x_dims.size()));
+
+    PADDLE_ENFORCE_EQ(
+        bias_dims.size(), 1,
+        platform::errors::InvalidArgument(
+            "The Input tensor bias's dimension of FusedGemmEpilogueOp "
+            " should be == 1, but got %d.",
+            bias_dims.size()));
+
+    PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? y_dims[0] : y_dims[1],
+                      platform::errors::InvalidArgument(
+                          "The Input tensor bias's dimension 0"
+                          " should be == Y[-1], but got bias's shape = [%s] "
+                          "and Y's shape = [%s]",
+                          bias_dims, y_dims));
+
+    auto x_mat_dims =
+        phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1);
+
+    int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1];
+    int K_from_y = trans_y ? y_dims[1] : y_dims[0];
+
+    PADDLE_ENFORCE_EQ(
+        K_from_x, K_from_y,
+        platform::errors::InvalidArgument(
+            "The last dimension of X should be equal to Y's first dimension. "
+            "But received X[-1] = [%d], Y[0] = [%d].",
+            K_from_x, K_from_y));
+
+    auto activation = ctx->Attrs().Get<std::string>("activation");
+
+    if ((activation != "relu") && (activation != "gelu") &&
+        (activation != "none")) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The activation attribute of fused_gemm_epilogue op should be"
+          " one of {\"none\", \"relu\", \"gelu\"}. But received "
+          "activation=%s.",
+          activation));
+    }
+
+    if (activation == "none" && ctx->HasOutput("ReserveSpace")) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The ReserveSpace would not be used when activation = \"none\""));
+    }
+
+    // cublasLt's restriction for auxiliary.
+    if (ctx->HasOutput("ReserveSpace") && activation != "none") {
+      int min_size_of_n = activation == "relu" ? 128 : 8;
+      int N_size = trans_y ? y_dims[0] : y_dims[1];
+      PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0,
+                        platform::errors::InvalidArgument(
+                            "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) "
+                            "should be a multiple of %d when ReserveSpace is "
+                            "given and activation=%s, but got N = %d.",
+                            min_size_of_n, activation, N_size));
+    }
+
+    std::vector<int64_t> out_dims;
+    out_dims.reserve(static_cast<size_t>(x_dims.size()));
+    if (trans_x) {
+      for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]);
+    } else {
+      for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]);
+    }
+
+    if (trans_y) {
+      out_dims.push_back(y_dims[0]);
+    } else {
+      out_dims.push_back(y_dims[1]);
+    }
+
+    ctx->SetOutputDim("Out", phi::make_ddim(out_dims));
+    // Note (Ming Huang): The reserve space of relu is a bit-mask,
+    // which cannot pass the nan_and_inf checking if its shape is set.
+    if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) {
+      ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims));
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const {
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
+    return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
+  }
+};
+
+class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias).");
+    AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias).");
+    AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias).");
+
+    AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias).");
+    AddOutput("ReserveSpace",
+              R"DOC(Reserved GPU space to place the
+auxiliary data pointer. It is used to pass the auxiliary data pointer
+for the fused_gemm_epilogue op. If not given (empty string), the
+auxiliary mode would not be enabled.)DOC")
+        .AsDispensable()
+        .AsExtra();
+
+    AddAttr<bool>(
+        "trans_x",
+        R"DOC((bool, default false), Whether to transpose input tensor X
+or not. The input tensor X could have more than two dimensions. When
+trans_x is set to true, it fully reverses X. For instance: X with shape
+[d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC")
+        .SetDefault(false);
+    AddAttr<bool>(
+        "trans_y",
+        R"DOC((bool, default false), Whether to transpose input tensor Y
+or not. The input tensor Y should be two-dimensional. When
+trans_y is set to true, Y is transposed. For instance: Y with shape
+[d0, d1] -> [d1, d0].)DOC")
+        .SetDefault(false);
+
+    AddAttr<std::string>(
+        "activation",
+        R"DOC((string, default none), The activation function. It could be
+one of {none, relu, gelu}. When none is given, no activation is
+applied.)DOC")
+        .SetDefault("none");
+
+    AddComment(R"DOC(
+FusedGemmEpilogue Operator
+This operator is used to perform Activation(Elementwise_add(Matmul(X, Y), bias)).
+It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU).
+
+Note:
+X can have more than two dimensions and is flattened to 2D for computing.
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3]
+)DOC");
+  }
+};
+
+class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut",
+                   "FusedGemmEpilogueGradOp");
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp");
+    OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp");
+    OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY",
+                   "FusedGemmEpilogueGradOp");
+
+    auto dout_dims = ctx->GetInputDim("DOut");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_GE(
+        dout_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp "
+            " should be >= 2, but got %d.",
+            dout_dims.size()));
+
+    PADDLE_ENFORCE_EQ(
+        y_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor Y's dimension of FusedGemmEpilogueGradOp "
+            " should be 2, but got %d.",
+            y_dims.size()));
+
+    PADDLE_ENFORCE_GE(
+        x_dims.size(), 2,
+        platform::errors::InvalidArgument(
+            "The Input tensor X's dimension of FusedGemmEpilogueGradOp "
+            " should be >= 2, but got %d.",
+            x_dims.size()));
+
+    PADDLE_ENFORCE_EQ(
+        dout_dims.size(), x_dims.size(),
+        platform::errors::InvalidArgument(
+            "The Input tensor DOut's and X's dimension of "
+            "FusedGemmEpilogueGradOp "
+            " should be the same, but got DOut's dim = %d and X's = %d.",
+            dout_dims.size(), x_dims.size()));
+
+    auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1);
+
+    auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1);
+
+    PADDLE_ENFORCE_EQ(
+        dout_mat_dims[1], y_dims[1],
+        platform::errors::InvalidArgument(
+            "The last dimension of DOut should be equal to Y's last "
+            "dimension. But received DOut[-1] = [%d], Y[1] = [%d].",
+            dout_mat_dims[1], y_dims[1]));
+
+    PADDLE_ENFORCE_EQ(
+        dout_mat_dims[0], x_mat_dims[0],
+        platform::errors::InvalidArgument(
+            "The first dimension of DOut should be equal to X's first "
+            "dimension. But received DOut[0] = [%d], X[0] = [%d].",
+            dout_mat_dims[0], x_mat_dims[0]));
+
+    auto activation_grad = ctx->Attrs().Get<std::string>("activation_grad");
+    if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") &&
+        (activation_grad != "none")) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The activation_grad attribute of fused_gemm_epilogue_grad op "
+          "should be one of {\"none\", \"relu_grad\", \"gelu_grad\"}. "
+          "But received activation_grad=%s.",
+          activation_grad));
+    }
+
+    if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The ReserveSpace should not be empty "
+          "when activation_grad is relu_grad or gelu_grad."));
+    }
+
+    if (ctx->HasOutput("DX")) {
+      std::vector<int64_t> dx_dims;
+      dx_dims.reserve(static_cast<size_t>(x_dims.size()));
+      for (int i = 0; i < x_dims.size(); ++i) {
+        dx_dims.push_back(x_dims[i]);
+      }
+      ctx->SetOutputDim("DX", phi::make_ddim(dx_dims));
+    }
+
+    std::vector<int64_t> dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size());
+    ctx->SetOutputDim("DY", phi::make_ddim(dy_dims));
+
+    if (ctx->HasOutput("DBias")) {
+      std::vector<int64_t> dbias_dims;
+      dbias_dims.push_back(y_dims[1]);
+      ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims));
+    }
+  }
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const {
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut");
+    return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
+  }
+};
+
+class FusedGemmEpilogueGradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("DOut",
+             "The input grad tensor of Out in Out = (Act(X) * Y) + bias.");
+    AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias.");
+    AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias.");
+    AddInput("ReserveSpace",
+             R"DOC(A GPU space to fetch the
+auxiliary data pointer. It is used to pass the auxiliary data pointer
+for the fused_gemm_epilogue_grad op. If not given (empty string), the
+auxiliary mode would not be enabled.)DOC")
+        .AsDispensable();
+
+    AddOutput("DX", "The output grad tensor of X in Out = (Act(X) * Y) + bias.")
+        .AsDispensable();
+    AddOutput("DY",
+              "The output grad tensor of Y in Out = (Act(X) * Y) + bias.");
+    AddOutput("DBias",
+              "The output grad tensor of bias in Out = (Act(X) * Y) + bias.")
+        .AsDispensable();
+
+    AddAttr<std::string>(
+        "activation_grad",
+        R"DOC((string, default none), The backward activation function. It
+could be one of {none, relu_grad, gelu_grad}. When none is given, no
+backward activation is applied.)DOC")
+        .SetDefault("none");
+
+    AddComment(R"DOC(
+FusedGemmEpilogueGrad Operator
+This operator is used to perform the backward of
+Elementwise_add(Matmul(Activation(X), Y), bias).
+It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear.
+
+Note:
+X can have more than two dimensions and is flattened to 2D for computing.
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3]
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp,
+                  ops::FusedGemmEpilogueOpMaker);
+REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp,
+                  ops::FusedGemmEpilogueGradOpMaker);
diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
new file mode 100644
index 0000000000000..e16c9e8f483cc
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -0,0 +1,376 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + bool enable_auxiliary = reserve_space == nullptr ? false : true; + + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtMatmulDesc_t operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc, compute_type, scale_type)); + cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transy = trans_y ? CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, + sizeof(transx))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, + sizeof(transy))); + + cublasLtEpilogue_t epiloque_func = + get_epilogue_type_(activation, enable_auxiliary); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, + sizeof(epiloque_func))); + const T* bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, + sizeof(bias_data))); + + if (enable_auxiliary && activation != "none") { + size_t reserve_space_size = 0; + if (activation == "relu") { + // Count in bits. 
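+        // (cuBLASLt's ReLU auxiliary buffer is a bit-mask recording which
+        // outputs were positive, one bit per output element, hence
+        // product(out->dims()) / 8 bytes.)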
+        reserve_space_size = phi::product(out->dims()) / 8;
+      } else {
+        reserve_space_size = phi::product(out->dims()) * sizeof(T);
+      }
+      reserve_space->mutable_data(ctx.GetPlace(), out->type(),
+                                  reserve_space_size);
+      void* aux_data = reinterpret_cast<void*>(reserve_space->data<T>());
+
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER,
+              &aux_data, sizeof(aux_data)));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N,
+              sizeof(N)));
+    }
+
+    cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL;
+    if (trans_x)
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &x_desc, mat_type, M, K, M));
+    else
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &x_desc, mat_type, K, M, K));
+    if (trans_y)
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &y_desc, mat_type, K, N, K));
+    else
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &y_desc, mat_type, N, K, N));
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+        &out_desc, mat_type, N, M, N));
+
+    cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
+    size_t workspace_size = 4 * 1024 * 1024;
+    const cublasLtMatmulAlgo_t* algo = nullptr;
+    cudaStream_t stream = dev_ctx.stream();
+    memory::allocation::AllocationPtr workspace =
+        memory::Alloc(dev_ctx, workspace_size);
+
+    double alpha64 = 1.0, beta64 = 0.0;
+    float alpha32 = 1.0f, beta32 = 0.0f;
+    void *alpha = nullptr, *beta = nullptr;
+    if (std::is_same<T, double>::value) {
+      alpha = &alpha64;
+      beta = &beta64;
+    } else {
+      alpha = &alpha32;
+      beta = &beta32;
+    }
+
+    // cuBLASLt assumes column-major layouts. Viewing the row-major tensors
+    // as their column-major transposes, the call below computes
+    // Out^T = Y^T * X^T, i.e. the row-major Out = X * Y; that is why y is
+    // passed as matrix A and x as matrix B.
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
+        lt_handle, operation_desc, alpha, y->data<T>(), y_desc, x->data<T>(),
+        x_desc, beta, out_data, out_desc, out_data, out_desc, algo,
+        workspace->ptr(), workspace_size, stream));
+  }
+
+ private:
+  static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation,
+                                               bool enable_auxiliary) {
+    if (activation == "relu") {
+      return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS
+                              : CUBLASLT_EPILOGUE_RELU_BIAS;
+    } else if (activation == "gelu") {
+      return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS
+                              : CUBLASLT_EPILOGUE_GELU_BIAS;
+    } else if (activation == "none") {
+      return CUBLASLT_EPILOGUE_BIAS;
+    } else {
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The activation attribute of fused_gemm_epilogue op should be"
+              " one of {\"none\", \"relu\", \"gelu\"}. "
+              "But received activation=%s.",
+              activation));
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    const Tensor* dout = ctx.Input<Tensor>("DOut");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* reserve_space = ctx.Input<Tensor>("ReserveSpace");
+
+    Tensor* dx = ctx.Output<Tensor>("DX");
+    Tensor* dy = ctx.Output<Tensor>("DY");
+    Tensor* dbias = ctx.Output<Tensor>("DBias");
+
+    std::string activation_grad = ctx.Attr<std::string>("activation_grad");
+
+    auto dout_mat_dims =
+        phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1);
+    auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1);
+
+    int64_t M = x_mat_dims[0];
+    int64_t K = y->dims()[0];
+    int64_t N = y->dims()[1];
+
+    cudaDataType_t mat_type = CUDA_R_32F;
+    cudaDataType_t scale_type = CUDA_R_32F;
+    cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F;
+    if (std::is_same<T, paddle::platform::float16>::value) {
+      mat_type = CUDA_R_16F;
+      scale_type = CUDA_R_16F;
+    }
+    if (std::is_same<T, double>::value) {
+      mat_type = CUDA_R_64F;
+      scale_type = CUDA_R_64F;
+      compute_type = CUBLAS_COMPUTE_64F;
+    }
+
+    cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
+    size_t workspace_size = 4 * 1024 * 1024;
+    const cublasLtMatmulAlgo_t* algo = nullptr;
+    cudaStream_t stream = dev_ctx.stream();
+
+    double alpha64 = 1.0, beta64 = 0.0;
+    float alpha32 = 1.0f, beta32 = 0.0f;
+    void *alpha = nullptr, *beta = nullptr;
+    if (std::is_same<T, double>::value) {
+      alpha = &alpha64;
+      beta = &beta64;
+    } else {
+      alpha = &alpha32;
+      beta = &beta32;
+    }
+
+    cublasOperation_t trans_dout = CUBLAS_OP_N;
+    cublasLtMatrixLayout_t dout_desc = NULL;
+    PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+        &dout_desc, mat_type, N, M, N));
+
+    if (dx) {
+      cublasLtMatmulDesc_t dx_operation_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
+          &dx_operation_desc, compute_type, scale_type));
+      cublasOperation_t trans_y = CUBLAS_OP_T;
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout,
+              sizeof(trans_dout)));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y,
+              sizeof(trans_y)));
+      cublasLtEpilogue_t epiloque_func_for_dx =
+          get_epilogue_type_(activation_grad);
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE,
+              &epiloque_func_for_dx, sizeof(epiloque_func_for_dx)));
+
+      if (activation_grad != "none") {
+        auto* aux_data = reserve_space->data();
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescSetAttribute(
+                dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER,
+                &aux_data, sizeof(aux_data)));
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescSetAttribute(
+                dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N,
+                sizeof(N)));
+      }
+
+      cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &y_desc, mat_type, N, K, N));
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &dx_desc, mat_type, K, M, K));
+
+      memory::allocation::AllocationPtr dx_workspace =
+          memory::Alloc(dev_ctx, workspace_size);
+
+      dx->mutable_data<T>(ctx.GetPlace());
+      auto* dx_data = dx->data<T>();
+      // dX = dOut * Y^T in row-major terms; in the column-major view this is
+      // dX^T = Y * dOut^T, so y is passed as matrix A (transposed) and dout
+      // as matrix B.
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
+          lt_handle, dx_operation_desc, alpha, y->data<T>(), y_desc,
+          dout->data<T>(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc,
+          algo, dx_workspace->ptr(), workspace_size, stream));
+    }
+
+    if (dy) {
+      cublasLtMatmulDesc_t dy_operation_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate(
+          &dy_operation_desc, compute_type, scale_type));
+      cublasOperation_t trans_x = CUBLAS_OP_T;
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout,
+              sizeof(trans_dout)));
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x,
+              sizeof(trans_x)));
+      // CUBLASLT_EPILOGUE_BGRADA additionally reduces dOut over the M
+      // dimension in the same call, producing dBias as a fused epilogue.
+      cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr
+                                                    ? CUBLASLT_EPILOGUE_DEFAULT
+                                                    : CUBLASLT_EPILOGUE_BGRADA;
+      PADDLE_ENFORCE_GPU_SUCCESS(
+          platform::dynload::cublasLtMatmulDescSetAttribute(
+              dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE,
+              &epiloque_func_for_dy, sizeof(epiloque_func_for_dy)));
+
+      if (dbias) {
+        dbias->mutable_data<T>(ctx.GetPlace());
+        auto* dbias_data = dbias->data<T>();
+        PADDLE_ENFORCE_GPU_SUCCESS(
+            platform::dynload::cublasLtMatmulDescSetAttribute(
+                dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER,
+                &dbias_data, sizeof(dbias_data)));
+      }
+
+      cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &x_desc, mat_type, K, M, K));
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate(
+          &dy_desc, mat_type, N, K, N));
+
+      memory::allocation::AllocationPtr dy_workspace =
+          memory::Alloc(dev_ctx, workspace_size);
+
+      dy->mutable_data<T>(ctx.GetPlace());
+      auto* dy_data = dy->data<T>();
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul(
+          lt_handle, dy_operation_desc, alpha, dout->data<T>(), dout_desc,
+          x->data<T>(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo,
+          dy_workspace->ptr(), workspace_size, stream));
+    }
+  }
+
+ private:
+  static cublasLtEpilogue_t get_epilogue_type_(
+      const std::string& activation_grad) {
+    if (activation_grad == "relu_grad") {
+      return CUBLASLT_EPILOGUE_DRELU;
+    } else if (activation_grad == "gelu_grad") {
+      return CUBLASLT_EPILOGUE_DGELU;
+    } else if (activation_grad == "none") {
+      return CUBLASLT_EPILOGUE_DEFAULT;
+    } else {
+      PADDLE_ENFORCE_EQ(
+          true, false,
+          platform::errors::InvalidArgument(
+              "The activation_grad attribute of fused_gemm_epilogue op "
+              "should be one of {\"none\", \"relu_grad\", \"gelu_grad\"}. "
+              "But received activation_grad=%s.",
+              activation_grad));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#if CUDA_VERSION >= 11060
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fused_gemm_epilogue,
+    ops::FusedGemmEpilogueKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::FusedGemmEpilogueKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::FusedGemmEpilogueKernel<paddle::platform::CUDADeviceContext,
+                                 paddle::platform::float16>);
+
+REGISTER_OP_CUDA_KERNEL(
+    fused_gemm_epilogue_grad,
+    ops::FusedGemmEpilogueGradKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::FusedGemmEpilogueGradKernel<paddle::platform::CUDADeviceContext,
+                                     double>,
+    ops::FusedGemmEpilogueGradKernel<paddle::platform::CUDADeviceContext,
+                                     paddle::platform::float16>);
+#endif
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
index ab7d474c1ac38..a32db3a9921e3 100644
--- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
+++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
@@ -19,6 +19,7 @@
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
@@ -110,5 +111,28 @@ class CublasHandleHolder {
   mutable std::mutex mtx_;
 };
 
+class CublasLtHandleHolder {
+ public:
+  CublasLtHandleHolder() {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasLtCreate(&handle_));
+  }
+  const cublasLtHandle_t& GetCublasLtHandle() const { return handle_; }
+
+  ~CublasLtHandleHolder() PADDLE_MAY_THROW {
+    PADDLE_RETRY_CUDA_SUCCESS(dynload::cublasLtDestroy(handle_));
+  }
+
+  inline void Call(
+      const std::function<void(cublasLtHandle_t)>& callback) const {
+    std::lock_guard<std::mutex> guard(mtx_);
+    callback(handle_);
+  }
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(CublasLtHandleHolder);
+
+  cublasLtHandle_t handle_;
+  mutable std::mutex mtx_;
+};
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h
index d7362fe9cbd81..d0b48eca5021b 100644
--- a/paddle/fluid/platform/device/gpu/gpu_types.h
+++ b/paddle/fluid/platform/device/gpu/gpu_types.h
@@ -1,4 +1,5 @@
 // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -24,6 +25,7 @@
 #else
 #include <cuda.h>
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #endif
 
@@ -70,6 +72,10 @@
 DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t);
 
 DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle);
 
+// TODO(Ming Huang): Since there is no rocBLAS counterpart of the blasLt
+// handle yet, use rocblas_handle as a workaround.
+DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle);
+
 using CUDAGraphID = unsigned long long;  // NOLINT
 
 #undef DECLARE_TYPE_FOR_GPU
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index f60cbc48694ff..18ac979b48ef3 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -1,4 +1,6 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -465,6 +467,9 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
   InitCuBlasContext();
   InitCuDNNContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  InitCuBlasLtContext();
+#endif
   InitCuSparseContext();
   InitCuSolverContext();
 #endif
@@ -476,6 +481,9 @@ void CUDAContext::SetStream(gpuStream_t stream) {
     DestoryCuDNNContext();
     DestoryCuBlasContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+    DestoryCuBlasLtContext();
+#endif
     DestoryCuSolverContext();
 #endif
@@ -485,6 +493,9 @@
     InitCuBlasContext();
     InitCuDNNContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+    InitCuBlasLtContext();
+#endif
     InitCuSolverContext();
 #endif
   }
@@ -495,6 +506,9 @@ CUDAContext::~CUDAContext() {
   DestoryCuDNNContext();
   DestoryCuBlasContext();
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  DestoryCuBlasLtContext();
+#endif
   DestoryCuSparseContext();
   DestoryCuSolverContext();
 #endif
@@ -551,6 +565,14 @@ cublasHandle_t CUDADeviceContext::cublas_handle() const {
   }
   return phi::GPUContext::cublas_handle();
 }
+#if CUDA_VERSION >= 11060
+cublasLtHandle_t CUDADeviceContext::cublaslt_handle() const {
+  if (thread_ctx_.count(this)) {
+    return context()->CublasLtHandle()->GetCublasLtHandle();
+  }
+  return phi::GPUContext::cublaslt_handle();
+}
+#endif
 cusparseHandle_t CUDADeviceContext::cusparse_handle() const {
   if (thread_ctx_.count(this)) {
     return context()->CusparseHandle()->GetCusparseHandle();
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index 29b6477b68374..e104170ca2495 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -1,4 +1,6 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -29,6 +31,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/dynload/cublas.h"
+#include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/cusolver.h"
 #include "paddle/fluid/platform/dynload/cusparse.h"
@@ -332,6 +335,12 @@ class CUDAContext {
   }
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  const std::unique_ptr<CublasLtHandleHolder>& CublasLtHandle() const {
+    return cublaslt_handle_;
+  }
+#endif
+
   const std::unique_ptr<CusparseHandleHolder>& CusparseHandle() const {
     return cusparse_handle_;
   }
@@ -348,6 +357,14 @@
   }
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  /*! \brief Call cublasLt function safely. */
+  inline void CublasLtCall(
+      const std::function<void(cublasLtHandle_t)>& callback) const {
+    cublaslt_handle_->Call(callback);
+  }
+#endif
+
   /*! \brief Call cusparse function safely.
   */
   inline void CusparseCall(
       const std::function<void(cusparseHandle_t)>& callback) const {
     cusparse_handle_->Call(callback);
   }
 
@@ -394,6 +411,12 @@
 #endif
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  void InitCuBlasLtContext() {
+    cublaslt_handle_.reset(new CublasLtHandleHolder());
+  }
+#endif
+
   void InitCuSparseContext() {
     cusparse_handle_.reset(new CusparseHandleHolder(RawStream()));
   }
 
@@ -472,6 +495,10 @@
   }
 
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  void DestoryCuBlasLtContext() { cublaslt_handle_.reset(); }
+#endif
+
   void DestoryCuSparseContext() { cusparse_handle_.reset(); }
 #endif
 
@@ -497,6 +524,9 @@
   std::unique_ptr<CublasHandleHolder> cublas_tensor_core_handle_;
   std::unique_ptr<CublasHandleHolder> cublas_tf32_tensor_core_handle_;
 #ifndef PADDLE_WITH_HIP
+#if CUDA_VERSION >= 11060
+  std::unique_ptr<CublasLtHandleHolder> cublaslt_handle_;
+#endif
   cusolverDnHandle_t cusolver_dn_handle_;
   std::unique_ptr<CusparseHandleHolder> cusparse_handle_;
 #endif
@@ -559,6 +589,7 @@ class CUDADeviceContext : public phi::GPUContext {
   rocblas_handle cublas_handle() const;
 #else
   cublasHandle_t cublas_handle() const;
+  cublasLtHandle_t cublaslt_handle() const;
   cusparseHandle_t cusparse_handle() const;
 #endif
 
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c016321ef802a..0a1cf604d2e8a 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1,4 +1,5 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -3440,6 +3441,31 @@ All parameter, weight, gradient are variables in Paddle.
           build_strategy = static.BuildStrategy()
           build_strategy.fuse_elewise_add_act_ops = True
       )DOC")
+      .def_property(
+          "fuse_gemm_epilogue",
+          [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_NE(self.IsFinalized(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy has been finalized, cannot be "
+                                  "configured again."));
+            self.fuse_gemm_epilogue_ = b;
+          },
+          R"DOC((bool, optional): fuse_gemm_epilogue indicates whether
+                to fuse matmul_op, elementwise_add_op and activation_op.
+                It may make the execution faster. Default is False.
+
+                Examples:
+                    .. code-block:: python
+
+                        import paddle
+                        import paddle.static as static
+
+                        paddle.enable_static()
+
+                        build_strategy = static.BuildStrategy()
+                        build_strategy.fuse_gemm_epilogue = True
+                )DOC")
       .def_property(
           "fuse_bn_act_ops",
           [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h
index d0787159e1e30..33daa2bba6b7d 100644
--- a/paddle/phi/backends/gpu/forwards.h
+++ b/paddle/phi/backends/gpu/forwards.h
@@ -1,4 +1,5 @@
 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Copyright (c) 2022 NVIDIA Corporation. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -56,6 +57,9 @@ using cudnnFusedOpsPlan_t = struct cudnnFusedOpsPlanStruct *;
 // Forward declaration of cuBLAS types.
 using cublasHandle_t = struct cublasContext *;
 
+// Forward declaration of cuBLASLt types.
+using cublasLtHandle_t = struct cublasLtContext *;
+
 // Forward declaration of cuSOLVER types.
using cusolverDnHandle_t = struct cusolverDnContext *; diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index dbcc1660c6472..09deb575f2414 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -171,6 +172,7 @@ struct GPUContext::Impl { InitStream(); InitEigenDevice(); InitBlasHandle(); + InitBlasLtHandle(); InitDNNHandle(); InitSolverHandle(); InitSparseHandle(); @@ -183,6 +185,7 @@ struct GPUContext::Impl { InitGpuProperties(); InitStream(); InitBlasHandle(); + InitBlasLtHandle(); InitDNNHandle(); InitSolverHandle(); InitSparseHandle(); @@ -212,6 +215,7 @@ struct GPUContext::Impl { } #endif DestroyInternalBlasHandle(); + DestroyInternalBlasLtHandle(); DestoryInternalStream(); } @@ -418,6 +422,25 @@ struct GPUContext::Impl { void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; } + void InitBlasLtHandle() { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtCreate(&blaslt_handle_); +#endif + } + + void DestroyInternalBlasLtHandle() { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtDestroy(blaslt_handle_); +#endif + } + + void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } + + blasLtHandle_t GetBlasLtHandle() const { + PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); + return blaslt_handle_; + } + void InitDNNHandle() { if (phi::dynload::HasCUDNN()) { #ifdef PADDLE_WITH_HIP @@ -679,6 +702,7 @@ struct GPUContext::Impl { blasHandle_t blas_handle_{nullptr}; blasHandle_t blas_tensor_core_handle_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; solverHandle_t solver_handle_{nullptr}; sparseHandle_t sparse_handle_{nullptr}; @@ -725,6 +749,10 @@ blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); } +blasLtHandle_t GPUContext::cublaslt_handle() const { + return impl_->GetBlasLtHandle(); +} + solverHandle_t GPUContext::cusolver_dn_handle() const { return impl_->GetSolverHandle(); } @@ -815,6 +843,10 @@ void GPUContext::SetBlasHandle(blasHandle_t blas) { impl_->SetBlasHandle(blas); } +void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { + impl_->SetBlasLtHandle(blaslt); +} + void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index b9d843982dc5e..3eb4360ad3538 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -1,4 +1,5 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Corporation. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -93,6 +94,9 @@ class GPUContext : public DeviceContext { /*! \brief Return cublas handle in the device context. */ blasHandle_t cublas_handle() const; + /*! \brief Return cublasLt handle in the device context. */ + blasLtHandle_t cublaslt_handle() const; + /*! \brief Return cusolver handle in the device context. 
*/ solverHandle_t cusolver_dn_handle() const; @@ -193,6 +197,8 @@ class GPUContext : public DeviceContext { void SetBlasHandle(blasHandle_t); + void SetBlasLtHandle(blasLtHandle_t); + void SetDnnHandle(dnnHandle_t); void SetSolverHandle(solverHandle_t); diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index 0be24392e1b40..4a6b9d2fd87f1 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -1,4 +1,5 @@ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Corporation. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -59,6 +60,10 @@ DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); +// TODO(Ming Huang): Since there is no blasLt handler, +// use rocblas_handle for workround. +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); + DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1443eebf29384..f8102ec408043 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -125,6 +125,17 @@ if(NOT WITH_GPU) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op) LIST(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) LIST(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op) + LIST(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass) +endif() + +if (WITH_GPU) + if (CUDA_VERSION LESS 11.6) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_op) + LIST(REMOVE_ITEM TEST_OPS test_fused_gemm_epilogue_grad_op) + LIST(REMOVE_ITEM TEST_OPS test_fuse_gemm_epilogue_pass) + endif() endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py new file mode 100644 index 0000000000000..7f3180e21d8c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fuse_gemm_epilogue_pass.py @@ -0,0 +1,392 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
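+# A minimal sketch of how the fusion under test is switched on; the same
+# steps appear in the _test_output helpers below:
+#
+#     build_strategy = paddle.static.BuildStrategy()
+#     build_strategy.fuse_gemm_epilogue = True
+#     program = paddle.static.CompiledProgram(main_prog).with_data_parallel(
+#         loss_name=loss.name, build_strategy=build_strategy,
+#         places=paddle.static.cuda_places())
+#
+# MultiFCLayer holds three Linear layers, so a fully fused graph is expected
+# to contain exactly three fused_gemm_epilogue nodes (see verify_node_count).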
+"""Test cases for role makers.""" + +from __future__ import print_function +import paddle +import os +import unittest +import numpy as np +import paddle.fluid.core as core + + +def compare(ref, res, atol, rtol): + + ref = np.array(ref).flatten() + res = np.array(res).flatten() + + tmp_ref = ref.astype(np.float) + tol = atol + rtol * abs(tmp_ref) + + diff = abs(res - ref) + + indices = np.transpose(np.where(diff > tol)) + if len(indices) == 0: + return True + return False + + +def verify_node_count(graph, node_name, target_count): + count = 0 + for node in graph.nodes(): + if node.name() == node_name: + count += 1 + return count == target_count + + +class MultiFCLayer(paddle.nn.Layer): + def __init__(self, hidden, Activation): + super(MultiFCLayer, self).__init__() + self.linear1 = paddle.nn.Linear(hidden, hidden) + self.linear2 = paddle.nn.Linear(hidden, hidden) + self.linear3 = paddle.nn.Linear(hidden, hidden) + + self.relu1 = Activation() + self.relu2 = Activation() + self.relu3 = Activation() + + def forward(self, x, matmul_y, ele_y): + output = self.linear1(x) + output = self.relu1(output) + output = self.linear2(output) + + output1 = paddle.matmul(output, matmul_y) + output = self.linear3(output) + output = self.relu2(output) + + output = paddle.matmul(output, matmul_y) + output = paddle.add(output, ele_y) + output = self.relu3(output) + output = paddle.add(output, output1) + return output + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueFWDBase(unittest.TestCase): + def setUp(self): + self.batch = 64 + self.seqlen = 128 + self.hidden = 768 + + paddle.enable_static() + + self.main_prog = paddle.static.Program() + self.startup_prog = paddle.static.Program() + + with paddle.static.program_guard(self.main_prog, self.startup_prog): + data = paddle.static.data( + name="_data", + shape=[-1, self.seqlen, self.hidden], + dtype='float32') + matmul_y = paddle.static.data( + name="_matmul_y", + shape=[1, self.hidden, self.hidden], + dtype='float32') + ele_y = paddle.static.data( + name="_ele_y", shape=[self.hidden, ], dtype='float32') + + multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0]) + with paddle.static.amp.fp16_guard(): + out = multi_layer(data, matmul_y, ele_y) + self.loss = paddle.mean(out) + + self.data_arr = np.random.random( + (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5 + self.matmul_y_arr = np.random.random( + (1, self.hidden, self.hidden)).astype("float32") - 0.5 + self.ele_y_arr = np.random.random( + (self.hidden, )).astype("float32") - 0.5 + + self.place = paddle.CUDAPlace(0) + self.exe = paddle.static.Executor(self.place) + self.exe.run(self.startup_prog) + + self._pre_test_hooks() + + self.feed = { + "_data": self.data_arr, + "_matmul_y": self.matmul_y_arr, + "_ele_y": self.ele_y_arr + } + self.reference = self.exe.run(self.main_prog, + feed=self.feed, + fetch_list=[self.loss.name]) + + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + def _test_output(self): + build_strategy = paddle.static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + program = paddle.static.CompiledProgram(self.main_prog) + program = program.with_data_parallel( + loss_name=self.loss.name, + build_strategy=build_strategy, + places=paddle.static.cuda_places()) + + result = self.exe.run(program, + feed=self.feed, + fetch_list=[self.loss.name]) + self.assertTrue( + compare(self.reference, result, self.atol, self.rtol), + "[{}] outputs are 
miss-matched.".format(type(self).__name__)) + self.assertTrue( + verify_node_count(program._graph, "fused_gemm_epilogue", 3), + "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.". + format(type(self).__name__)) + act_fwd_name = self._get_act_type()[1] + self.assertTrue( + verify_node_count(program._graph, act_fwd_name, 1), + "[{}] The number of {} is miss-matched in the computing graph.". + format(type(self).__name__, act_fwd_name)) + + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu" + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReluFWDFP32(TestFuseGemmEpilogueFWDBase): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReluFWDFP16(TestFuseGemmEpilogueReluFWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeluFWDFP32(TestFuseGemmEpilogueFWDBase): + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.GELU, "gelu" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeluFWDFP16(TestFuseGemmEpilogueGeluFWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueBWDBase(unittest.TestCase): + def setUp(self): + self.batch = 64 + self.seqlen = 128 + self.hidden = 768 + + paddle.enable_static() + + self.main_prog = paddle.static.Program() + self.startup_prog = paddle.static.Program() + + with paddle.static.program_guard(self.main_prog, self.startup_prog): + data = paddle.static.data( + name="_data", + shape=[-1, self.seqlen, self.hidden], + dtype='float32') + matmul_y = paddle.static.data( + name="_matmul_y", + shape=[1, self.hidden, self.hidden], + dtype='float32') + ele_y = paddle.static.data( + name="_ele_y", shape=[self.hidden, ], dtype='float32') + + multi_layer = MultiFCLayer(self.hidden, self._get_act_type()[0]) + with paddle.static.amp.fp16_guard(): + out = multi_layer(data, matmul_y, ele_y) + self.loss = paddle.mean(out) + paddle.static.append_backward(loss=self.loss) + + self.data_arr = np.random.random( + (self.batch, self.seqlen, self.hidden)).astype("float32") - 0.5 + self.matmul_y_arr = np.random.random( + (1, self.hidden, 
self.hidden)).astype("float32") - 0.5 + self.ele_y_arr = np.random.random( + (self.hidden, )).astype("float32") - 0.5 + + self.place = paddle.CUDAPlace(0) + self.exe = paddle.static.Executor(self.place) + self.exe.run(self.startup_prog) + + self._pre_test_hooks() + + self.feed = { + "_data": self.data_arr, + "_matmul_y": self.matmul_y_arr, + "_ele_y": self.ele_y_arr + } + + self.fetch = [ + self.loss.name, + '{}.w_0@GRAD'.format(multi_layer.linear1.full_name()), + '{}.b_0@GRAD'.format(multi_layer.linear1.full_name()), + '{}.w_0@GRAD'.format(multi_layer.linear2.full_name()), + '{}.b_0@GRAD'.format(multi_layer.linear2.full_name()), + '{}.w_0@GRAD'.format(multi_layer.linear3.full_name()), + '{}.b_0@GRAD'.format(multi_layer.linear3.full_name()) + ] + self.outs_ref = self.exe.run(self.main_prog, + feed=self.feed, + fetch_list=self.fetch) + + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + def _test_output(self): + build_strategy = paddle.static.BuildStrategy() + build_strategy.fuse_gemm_epilogue = True + program = paddle.static.CompiledProgram(self.main_prog) + program = program.with_data_parallel( + loss_name=self.loss.name, + build_strategy=build_strategy, + places=paddle.static.cuda_places()) + + outs_res = self.exe.run(program, feed=self.feed, fetch_list=self.fetch) + + for ref, res in zip(self.outs_ref, outs_res): + self.assertTrue( + compare(ref, res, self.atol, self.rtol), + "[{}] output is miss-matched.".format(type(self).__name__)) + + self.assertTrue( + verify_node_count(program._graph, "fused_gemm_epilogue", 3), + "[{}] The number of fused_gemm_epilogue is miss-matched in the computing graph.". + format(type(self).__name__)) + self.assertTrue( + verify_node_count(program._graph, "fused_gemm_epilogue_grad", 3), + "[{}] The number of fused_gemm_epilogue_grad is miss-matched in the computing graph.". + format(type(self).__name__)) + _, act_fwd_name, act_bwd_name = self._get_act_type() + self.assertTrue( + verify_node_count(program._graph, act_fwd_name, 1), + "[{}] The number of {} is miss-matched in the computing graph.". + format(type(self).__name__, act_fwd_name)) + self.assertTrue( + verify_node_count(program._graph, act_bwd_name, 2), + "[{}] The number of {} is miss-matched in the computing graph.". 
+ format(type(self).__name__, act_bwd_name)) + + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu", "relu_grad" + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReLUBWDFP32(TestFuseGemmEpilogueBWDBase): + def _pre_test_hooks(self): + self.atol = 1e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.ReLU, "relu", "relu_grad" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueReLUBWDFP16(TestFuseGemmEpilogueReLUBWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeLUBWDFP32(TestFuseGemmEpilogueBWDBase): + def _pre_test_hooks(self): + self.atol = 5e-4 + self.rtol = 1e-3 + + def _get_act_type(self): + return paddle.nn.GELU, "gelu", "gelu_grad" + + def test_output(self): + self._test_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGeLUBWDFP16(TestFuseGemmEpilogueGeLUBWDFP32): + def _pre_test_hooks(self): + self.atol = 1e-3 + self.rtol = 1e-2 + + fp16_var_list = paddle.static.amp.cast_model_to_fp16(self.main_prog) + paddle.static.amp.cast_parameters_to_fp16( + self.place, self.main_prog, to_fp16_var_names=fp16_var_list) + + self.data_arr = self.data_arr.astype("float16") + self.matmul_y_arr = self.matmul_y_arr.astype("float16") + self.ele_y_arr = self.ele_y_arr.astype("float16") + + +if __name__ == "__main__": + np.random.seed(0) + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py new file mode 100644 index 0000000000000..2ea1bf2e9cb81 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_grad_op.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
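+# The reference gradients checked below follow from Out = X @ Y + Bias with
+# activation == 'none' (they are computed by get_outputs() with numpy):
+#
+#     DX = DOut @ Y.T           # e.g. (8, 128) @ (128, 4) -> (8, 4)
+#     DY = X.T @ DOut           # e.g. (4, 8) @ (8, 128) -> (4, 128)
+#     DBias = DOut.sum(axis=0)  # -> (128,)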
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci + + +def get_outputs(DOut, X, Y): + DX = np.dot(DOut, Y.T) + DY = np.dot(X.T, DOut) + DBias = np.sum(DOut, axis=0) + + return DX, DY, DBias + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYBiasFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY, 'DBias': DBias} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYBiasFP32( + TestFuseGemmEpilogueGradOpDXYBiasFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYBiasFP64( + TestFuseGemmEpilogueGradOpDXYBiasFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYBiasFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY, 'DBias': DBias} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYBiasFP32( + TestFuseGemmEpilogueGradOpDYBiasFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYBiasFP64( + TestFuseGemmEpilogueGradOpDYBiasFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class 
TestFuseGemmEpilogueGradOpDYFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYFP32(TestFuseGemmEpilogueGradOpDYFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDYFP64(TestFuseGemmEpilogueGradOpDYFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue_grad" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYFP32(TestFuseGemmEpilogueGradOpDXYFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueGradOpDXYFP64(TestFuseGemmEpilogueGradOpDXYFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +if __name__ == "__main__": + np.random.seed(0) + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py new file mode 100644 index 0000000000000..f826898f9e5dd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gemm_epilogue_op.py @@ -0,0 +1,450 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci + + +def gelu(x): + y_ref = 0.5 * x * ( + 1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) + return y_ref.astype(x.dtype) + + +def relu(x): + mask = x > 0 + return x * mask + + +def get_output(X, Y, bias, act): + out = np.dot(X, Y) + bias + if act == 'relu': + return relu(out) + elif act == 'gelu': + return gelu(out) + else: + return out + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'], + self.inputs['Bias'], 'relu') + } + self.attrs = {"activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP32(TestFuseGemmEpilogueOpReluMMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP64(TestFuseGemmEpilogueOpReluMMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((4, 8)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].T, self.inputs['Y'], + self.inputs['Bias'], 'relu') + } + self.attrs = {'trans_x': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class 
TestFuseGemmEpilogueOpReluMTMFP32(TestFuseGemmEpilogueOpReluMTMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP64(TestFuseGemmEpilogueOpReluMTMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMTFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'].T, + self.inputs['Bias'], 'relu') + } + self.attrs = {'trans_y': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMTFP32(TestFuseGemmEpilogueOpReluMMTFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMTFP64(TestFuseGemmEpilogueOpReluMMTFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMTFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((4, 8)).astype(self.dtype) - 0.5, + 'Y': np.random.random((128, 4)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].T, self.inputs['Y'].T, + self.inputs['Bias'], 'relu') + } + self.attrs = {'trans_x': True, 'trans_y': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMTFP32(TestFuseGemmEpilogueOpReluMTMTFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMTFP64(TestFuseGemmEpilogueOpReluMTMTFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not 
compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP16MultiDimX(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((2, 2, 8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].reshape( + (-1, 4)), self.inputs['Y'], self.inputs['Bias'], + 'relu').reshape((2, 2, 8, 128)) + } + self.attrs = {"activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP32MultiDimX( + TestFuseGemmEpilogueOpReluMMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMMFP64MultiDimX( + TestFuseGemmEpilogueOpReluMMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP16MultiDimX(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((4, 2, 2, 8)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + self.outputs = { + 'Out': get_output(self.inputs['X'].reshape( + (4, -1)).T, self.inputs['Y'], self.inputs['Bias'], + 'relu').reshape((2, 2, 8, 128)) + } + self.attrs = {'trans_x': True, "activation": 'relu'} + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP32MultiDimX( + TestFuseGemmEpilogueOpReluMTMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpReluMTMFP64MultiDimX( + TestFuseGemmEpilogueOpReluMTMFP16MultiDimX): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpGeluMMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + 
} + + self.attrs = {"activation": 'gelu'} + + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'], + self.inputs['Bias'], 'gelu') + } + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpGeluMMFP32(TestFuseGemmEpilogueOpGeluMMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpGeluMMFP64(TestFuseGemmEpilogueOpGeluMMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpNoneMMFP16(OpTest): + def setUp(self): + self.op_type = "fused_gemm_epilogue" + self.place = core.CUDAPlace(0) + self.init_dtype_type() + + self.inputs = { + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5, + 'Bias': np.random.random((128, )).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + self.outputs = { + 'Out': get_output(self.inputs['X'], self.inputs['Y'], + self.inputs['Bias'], 'none') + } + + def init_dtype_type(self): + self.dtype = np.float16 + self.atol = 1e-3 + + def test_check_output(self): + if self.dtype == np.float16 and not core.is_float16_supported( + self.place): + return + self.check_output_with_place(self.place, atol=self.atol) + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpNoneMMFP32(TestFuseGemmEpilogueOpNoneMMFP16): + def init_dtype_type(self): + self.dtype = np.single + self.atol = 1e-6 + + +@skip_check_grad_ci(reason="no grap op") +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFuseGemmEpilogueOpNoneMMFP64(TestFuseGemmEpilogueOpNoneMMFP16): + def init_dtype_type(self): + self.dtype = np.double + self.atol = 1e-6 + + +if __name__ == "__main__": + np.random.seed(0) + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 7356f0c8db025..365047f7e8382 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -729,4 +730,6 @@ 'test_lu_op', 'test_margin_cross_entropy_op', 'test_pull_gpups_sparse_op', + 'test_fused_gemm_epilogue_op', + 'test_fused_gemm_epilogue_grad_op', ] From 1c29196e8de08edc18dbfc6c77ebcd22e595e1fd Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Mon, 7 Mar 2022 16:43:29 +0800 Subject: [PATCH 05/50] [Phi]Move bincount OP to phi (#39947) * move bincount OP to phi * fix dtype * set_dtype by weights or x * fix conflicts --- paddle/fluid/operators/bincount_op.cc | 62 ++------ paddle/fluid/operators/bincount_op.cu | 162 --------------------- paddle/fluid/operators/bincount_op.h | 109 -------------- paddle/phi/infermeta/binary.cc | 50 +++++++ paddle/phi/infermeta/binary.h | 4 + paddle/phi/kernels/bincount_kernel.h | 28 ++++ paddle/phi/kernels/cpu/bincount_kernel.cc | 106 ++++++++++++++ paddle/phi/kernels/gpu/bincount_kernel.cu | 164 ++++++++++++++++++++++ paddle/phi/ops/compat/bincount_sig.cc | 25 ++++ 9 files changed, 386 insertions(+), 324 deletions(-) delete mode 100644 paddle/fluid/operators/bincount_op.cu delete mode 100644 paddle/fluid/operators/bincount_op.h create mode 100644 paddle/phi/kernels/bincount_kernel.h create mode 100644 paddle/phi/kernels/cpu/bincount_kernel.cc create mode 100644 paddle/phi/kernels/gpu/bincount_kernel.cu create mode 100644 paddle/phi/ops/compat/bincount_sig.cc diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index b37334a14bad4..062e7d510d54c 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bincount_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of BincountOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of BincountOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto minlength = ctx->Attrs().Get("minlength"); - - PADDLE_ENFORCE_GE(minlength, 0, - platform::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ(input_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (ctx->HasInput("Weights")) { - auto weights_dim = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], input_dim[0], - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." 
- "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, input_dim)); - } - - ctx->SetOutputDim("Out", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = @@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor, + PD_INFER_META(phi::BincountInferMeta)); REGISTER_OPERATOR( bincount, ops::BincountOp, ops::BincountOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - bincount, ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel); + paddle::framework::EmptyGradOpMaker, + BincountInferShapeFunctor); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu deleted file mode 100644 index cc576d0af9287..0000000000000 --- a/paddle/fluid/operators/bincount_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__global__ void KernelBincount(const InputT* input, const int total_elements, - const bool has_weights, const T* weights, - OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); - } - } -} - -template -void BincountCUDAInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - const int input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - - auto input_max_scala = 
framework::EigenScalar::From(input_max_t); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - - auto* place = context.template device_context().eigen_device(); - input_max_scala.device(*place) = input_x.maximum(); - input_min_scala.device(*place) = input_x.minimum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - - InputT input_min = input_min_cpu.data()[0]; - - PADDLE_ENFORCE_GE( - input_min, static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = - static_cast(input_max_cpu.data()[0]) + 1L; - - output_size = std::max(output_size, static_cast(minlength)); - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - const T* weights_data = has_weights ? weights->data() : nullptr; - - auto stream = - context.template device_context().stream(); - - if (!has_weights) { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } - } -} - -template -class BincountCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountCUDAInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountCUDAInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bincount, ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h deleted file mode 100644 index 84256bf78e4a1..0000000000000 --- a/paddle/fluid/operators/bincount_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void BincountInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - auto input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - - PADDLE_ENFORCE_GE( - *std::min_element(input_data, input_data + input_numel), - static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = static_cast(*std::max_element( - input_data, input_data + input_numel)) + - 1L; - output_size = std::max(output_size, static_cast(minlength)); - - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - if (has_weights) { - const T* weights_data = weights->data(); - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } - - } else { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += 1L; - } - } -} - -template -class BincountKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 94b489906c6d5..55230aa8d0516 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -456,6 +456,56 @@ void BCELossInferMeta(const MetaTensor& input, out->share_lod(input); } +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out) { + auto input_dim = x.dims(); + + PADDLE_ENFORCE_GE(minlength, + 0, + phi::errors::InvalidArgument( + "The minlength should be greater than or equal to 0." + "But received minlength is %d", + minlength)); + + PADDLE_ENFORCE_EQ( + input_dim.size(), + 1, + phi::errors::InvalidArgument("The 'shape' of Input(X) must be 1-D tensor." 
+ "But the dimension of Input(X) is [%d]", + input_dim.size())); + + if (weights.is_initialized()) { + auto weights_dim = weights->dims(); + PADDLE_ENFORCE_EQ(weights_dim.size(), + 1, + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be 1-D tensor." + "But the dimension of Input(Weights) is [%d]", + weights_dim.size())); + + PADDLE_ENFORCE_EQ( + weights_dim[0], + input_dim[0], + phi::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be equal to the 'shape' of " + "Input(X)." + "But received: the 'shape' of Input(Weights) is [%s]," + "the 'shape' of Input(X) is [%s]", + weights_dim, + input_dim)); + } + out->set_dims(phi::make_ddim({-1})); + if (weights.is_initialized()) { + out->set_dtype(weights->dtype()); + } else { + out->set_dtype(x.dtype()); + } + + out->share_lod(x); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index caf9185c9005a..106c22f7548bd 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -85,6 +85,10 @@ void BCELossInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void BincountInferMeta(const MetaTensor& x, + const paddle::optional weights, + int minlength, + MetaTensor* out); void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, diff --git a/paddle/phi/kernels/bincount_kernel.h b/paddle/phi/kernels/bincount_kernel.h new file mode 100644 index 0000000000000..3ba69d365480f --- /dev/null +++ b/paddle/phi/kernels/bincount_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BincountKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/bincount_kernel.cc b/paddle/phi/kernels/cpu/bincount_kernel.cc new file mode 100644 index 0000000000000..c9dc44c1e04eb --- /dev/null +++ b/paddle/phi/kernels/cpu/bincount_kernel.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
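+//
+// CPU bincount: dispatch on the input dtype (int32/int64), size the output as
+// max(max(input) + 1, minlength), then accumulate either plain counts (int64)
+// or the optional Weights values (float/double).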
+ +#include "paddle/phi/kernels/bincount_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void BincountInner(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + const DenseTensor* input = &x; + DenseTensor* output = out; + const InputT* input_data = input->data(); + + auto input_numel = input->numel(); + + if (input_data == nullptr) { + phi::DDim out_dim{0}; + output->Resize(out_dim); + dev_ctx.template Alloc(output); + return; + } + + PADDLE_ENFORCE_GE( + *std::min_element(input_data, input_data + input_numel), + static_cast(0), + phi::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = static_cast(*std::max_element( + input_data, input_data + input_numel)) + + 1L; + output_size = std::max(output_size, static_cast(minlength)); + + phi::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = weights.is_initialized(); + + if (has_weights) { + const T* weights_data = weights->data(); + if (weights->dtype() == DataType::FLOAT32) { + float* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } else { + double* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } + + } else { + int64_t* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()(dev_ctx, output, 0L); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += 1L; + } + } +} + +template +void BincountKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + if (x.dtype() == DataType::INT32) { + BincountInner(dev_ctx, x, weights, minlength, out); + } else if (x.dtype() == DataType::INT64) { + BincountInner(dev_ctx, x, weights, minlength, out); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(bincount, + CPU, + ALL_LAYOUT, + phi::BincountKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/bincount_kernel.cu b/paddle/phi/kernels/gpu/bincount_kernel.cu new file mode 100644 index 0000000000000..a4ec894790cd3 --- /dev/null +++ b/paddle/phi/kernels/gpu/bincount_kernel.cu @@ -0,0 +1,164 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
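+//
+// GPU bincount: reduce the input's min/max on device with Eigen to size the
+// output, copy them back to the host to check that all elements are
+// non-negative, then launch KernelBincount, which accumulates counts or
+// weights with atomic adds.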
+ +#include "paddle/phi/kernels/bincount_kernel.h" + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelBincount(const InputT* input, + const int total_elements, + const bool has_weights, + const T* weights, + OutT* output) { + if (!has_weights) { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); + } + } else { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], + static_cast(weights[i])); + } + } +} + +template +void BincountCUDAInner(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + const DenseTensor* input = &x; + DenseTensor* output = out; + const InputT* input_data = input->data(); + + const int input_numel = input->numel(); + + if (input_data == nullptr) { + phi::DDim out_dim{0}; + output->Resize(out_dim); + dev_ctx.template Alloc(output); + return; + } + auto input_x = EigenVector::Flatten(*input); + DenseTensor input_min_t, input_max_t; + input_max_t.Resize({1}); + auto* input_max_data = dev_ctx.template Alloc(&input_max_t); + input_min_t.Resize({1}); + auto* input_min_data = dev_ctx.template Alloc(&input_min_t); + + auto input_max_scala = EigenScalar::From(input_max_t); + auto input_min_scala = EigenScalar::From(input_min_t); + + auto* place = dev_ctx.eigen_device(); + input_max_scala.device(*place) = input_x.maximum(); + input_min_scala.device(*place) = input_x.minimum(); + + DenseTensor input_min_cpu, input_max_cpu; + paddle::framework::TensorCopySync( + input_max_t, phi::CPUPlace(), &input_max_cpu); + paddle::framework::TensorCopySync( + input_min_t, phi::CPUPlace(), &input_min_cpu); + + InputT input_min = input_min_cpu.data()[0]; + + PADDLE_ENFORCE_GE( + input_min, + static_cast(0), + phi::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = + static_cast(input_max_cpu.data()[0]) + 1L; + + output_size = std::max(output_size, static_cast(minlength)); + phi::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = weights.is_initialized(); + + const T* weights_data = has_weights ? 
weights->data() : nullptr; + auto stream = dev_ctx.stream(); + + if (!has_weights) { + int64_t* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()(dev_ctx, output, 0L); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + const auto& weights_type = + paddle::framework::TransToProtoVarType(weights->dtype()); + + if (weights->dtype() == DataType::FLOAT32) { + float* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + double* output_data = dev_ctx.template Alloc(output); + phi::funcs::SetConstant()( + dev_ctx, output, static_cast(0)); + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } + } +} + +template +void BincountKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional weights, + int minlength, + DenseTensor* out) { + if (x.dtype() == DataType::INT32) { + BincountCUDAInner(dev_ctx, x, weights, minlength, out); + } else if (x.dtype() == DataType::INT64) { + BincountCUDAInner(dev_ctx, x, weights, minlength, out); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(bincount, + GPU, + ALL_LAYOUT, + phi::BincountKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/ops/compat/bincount_sig.cc b/paddle/phi/ops/compat/bincount_sig.cc new file mode 100644 index 0000000000000..35067c256ed49 --- /dev/null +++ b/paddle/phi/ops/compat/bincount_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BincountOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("bincount", {"X", "Weights"}, {"minlength"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(bincount, phi::BincountOpArgumentMapping); From c09adab84fdd1fb13ac751871787d3337ba3ca77 Mon Sep 17 00:00:00 2001 From: houj04 <35131887+houj04@users.noreply.github.com> Date: Mon, 7 Mar 2022 16:57:56 +0800 Subject: [PATCH 06/50] refactor unittest for nearest_interp_v2_op_xpu. test=kunlun (#39804) * refactor unittest for nearest_interp_v2_op_xpu. test=kunlun * fix code style. test=kunlun * fix code style. test=kunlun --- .../xpu/test_nearest_interp_v2_op_xpu.py | 731 +++++++++--------- 1 file changed, 349 insertions(+), 382 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py index 8c1ce68e9d0f8..7a3b4a5a2179a 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_nearest_interp_v2_op_xpu.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,13 +16,14 @@ import unittest import numpy as np -import paddle -import paddle.fluid.core as core import sys sys.path.append("..") + +import paddle + +from op_test import OpTest from op_test_xpu import XPUOpTest -import paddle.fluid as fluid -from paddle.fluid import Program, program_guard +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper paddle.enable_static() @@ -158,390 +159,356 @@ def nearest_neighbor_interp3d_np(X, return out.astype(X.dtype) -class TestNearestInterpOp(XPUOpTest): - def setUp(self): - self.use_xpu = True - self.out_size = None - self.actual_shape = None - self.data_layout = 'NCHW' - self.init_test_case() - self.op_type = "nearest_interp_v2" - input_np = np.random.random(self.input_shape).astype("float32") - - if self.data_layout == "NCHW" and len(self.input_shape) == 4: - in_d = 1 - in_h = self.input_shape[2] - in_w = self.input_shape[3] - else: - in_d = 1 - in_h = self.input_shape[1] - in_w = self.input_shape[2] - - if self.data_layout == "NCDHW" and len(self.input_shape) == 5: - in_d = self.input_shape[2] - in_h = self.input_shape[3] - in_w = self.input_shape[4] - else: - in_d = self.input_shape[1] - in_h = self.input_shape[2] - in_w = self.input_shape[3] - scale_d = 0 - scale_h = 0 - scale_w = 0 - if self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - scale_d = scale_h = scale_w = float(self.scale) - if isinstance(self.scale, list) and len(self.scale) == 1: - scale_d = scale_w = scale_h = self.scale[0] - elif isinstance(self.scale, list) and len(self.scale) > 1: - if len(self.scale) == 5: - scale_w = self.scale[2] - scale_h = self.scale[1] - scale_d = self.scale[0] - else: - scale_w = self.scale[1] - scale_h = self.scale[0] +class XPUNearestInterpOpWrapper(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'nearest_interp_v2' + self.use_dynamic_create_class = False - out_h = int(in_h * scale_h) - out_w = int(in_w * scale_w) - out_d = int(in_d * scale_d) - else: - if len(self.input_shape) == 5: - out_d = self.out_d - out_h = self.out_h - out_w = self.out_w + class TestNearestInterpOp(XPUOpTest): + def setUp(self): + self.place = paddle.XPUPlace(0) + self.init_dtype() - if len(self.input_shape) == 4: - output_np = nearest_neighbor_interp_np( - input_np, out_h, out_w, scale_h, scale_w, self.out_size, - self.actual_shape, self.align_corners, self.data_layout) - elif len(self.input_shape) == 5: - output_np = nearest_neighbor_interp3d_np( - input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w, - self.out_size, self.actual_shape, self.align_corners, - self.data_layout) - self.inputs = {'X': input_np} - if self.out_size is not None: - self.inputs['OutSize'] = self.out_size - if self.actual_shape is not None: - self.inputs['OutSize'] = self.actual_shape - if len(self.input_shape) == 5: - self.attrs = { - 'out_d': self.out_d, - 'out_h': self.out_h, - 'out_w': self.out_w, - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - 'data_layout': self.data_layout - } - else: + self.out_size = None + self.actual_shape = None + self.data_layout = 'NCHW' + + self.interp_method = 'nearest' + self.scale = 0. 
+            self.align_corners = True
+
+            self.init_test_case()
+            self.op_type = "nearest_interp_v2"
+            input_np = np.random.random(self.input_shape).astype(self.dtype)
+
+            # in
+            if self.data_layout == "NCHW" and len(self.input_shape) == 4:
+                in_d = 1
+                in_h = self.input_shape[2]
+                in_w = self.input_shape[3]
+            else:
+                in_d = 1
+                in_h = self.input_shape[1]
+                in_w = self.input_shape[2]
+
+            if self.data_layout == "NCDHW" and len(self.input_shape) == 5:
+                in_d = self.input_shape[2]
+                in_h = self.input_shape[3]
+                in_w = self.input_shape[4]
+            else:
+                in_d = self.input_shape[1]
+                in_h = self.input_shape[2]
+                in_w = self.input_shape[3]
+
+            # scale
+            scale_d = 0
+            scale_h = 0
+            scale_w = 0
+            if self.scale:
+                if isinstance(self.scale, float) or isinstance(self.scale, int):
+                    if self.scale > 0:
+                        scale_d = scale_h = scale_w = float(self.scale)
+                        self.scale = [self.scale]
+                if isinstance(self.scale, list) and len(self.scale) == 1:
+                    scale_d = scale_w = scale_h = self.scale[0]
+                    self.scale = [self.scale[0], self.scale[0]]
+                elif isinstance(self.scale, list) and len(self.scale) > 1:
+                    if len(self.scale) == 5:
+                        scale_w = self.scale[2]
+                        scale_h = self.scale[1]
+                        scale_d = self.scale[0]
+                    else:
+                        scale_w = self.scale[1]
+                        scale_h = self.scale[0]
+
+                out_h = int(in_h * scale_h)
+                out_w = int(in_w * scale_w)
+                out_d = int(in_d * scale_d)
+            else:
+                if len(self.input_shape) == 5:
+                    out_d = self.out_d
+                out_h = self.out_h
+                out_w = self.out_w
+
+            # output_np
+            if len(self.input_shape) == 4:
+                output_np = nearest_neighbor_interp_np(
+                    input_np, out_h, out_w, scale_h, scale_w, self.out_size,
+                    self.actual_shape, self.align_corners, self.data_layout)
+            elif len(self.input_shape) == 5:
+                output_np = nearest_neighbor_interp3d_np(
+                    input_np, out_d, out_h, out_w, scale_d, scale_h, scale_w,
+                    self.out_size, self.actual_shape, self.align_corners,
+                    self.data_layout)
+            self.outputs = {'Out': output_np}
+
+            self.inputs = {'X': input_np}
+            if self.out_size is not None:
+                self.inputs['OutSize'] = self.out_size
+            if self.actual_shape is not None:
+                self.inputs['OutSize'] = self.actual_shape
+
+            if len(self.input_shape) == 5:
+                self.attrs = {
+                    'out_d': self.out_d,
+                    'out_h': self.out_h,
+                    'out_w': self.out_w,
+                    'interp_method': self.interp_method,
+                    'align_corners': self.align_corners,
+                    'data_layout': self.data_layout
+                }
+            else:
+                self.attrs = {
+                    'out_h': self.out_h,
+                    'out_w': self.out_w,
+                    'interp_method': self.interp_method,
+                    'align_corners': self.align_corners,
+                    'data_layout': self.data_layout
+                }
+
+            if self.scale:
+                self.attrs['scale'] = self.scale
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def test_check_output(self):
+            self.check_output_with_place(self.place)
+
+        def test_check_grad(self):
+            self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True)
+
+        def init_test_case(self):
+            self.input_shape = [2, 3, 4, 5]
+            self.out_h = 2
+            self.out_w = 2
+            self.out_size = np.array([3, 3]).astype("int32")
+
+    """
+    # case copied from gpu but disabled in xpu: does not support 5-dim input_shape
+    class TestNearestNeighborInterpCase1(TestNearestInterpOp):
+        def init_test_case(self):
+            self.interp_method = 'nearest'
+            self.input_shape = [4, 1, 1, 7, 8]
+            self.out_d = 1
+            self.out_h = 1
+            self.out_w = 1
+            self.scale = 0.
+            self.align_corners = True
+    """
+
+    class TestNearestNeighborInterpCase2(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 3, 9, 6]
+            self.out_h = 12
+            self.out_w = 12
+
+    class TestNearestNeighborInterpCase3(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [1, 1, 32, 64]
+            self.out_h = 64
+            self.out_w = 32
+
+    class TestNearestNeighborInterpCase4(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [4, 1, 7, 8]
+            self.out_h = 1
+            self.out_w = 1
+            self.out_size = np.array([2, 2]).astype("int32")
+
+    class TestNearestNeighborInterpCase5(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 3, 9, 6]
+            self.out_h = 12
+            self.out_w = 12
+            self.out_size = np.array([11, 11]).astype("int32")
+
+    class TestNearestNeighborInterpCase6(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [1, 1, 32, 64]
+            self.out_h = 64
+            self.out_w = 32
+            self.out_size = np.array([65, 129]).astype("int32")
+
+    class TestNearestNeighborInterpSame(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [2, 3, 32, 64]
+            self.out_h = 32
+            self.out_w = 64
+
+    class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 32, 16]
+            self.out_h = 64
+            self.out_w = 32
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    """
+    # case copied from gpu but disabled in xpu: does not support NHWC data_layout
+    class TestNearestNeighborInterpDataLayout(TestNearestInterpOp):
+        def init_test_case(self):
+            self.interp_method = 'nearest'
+            self.input_shape = [2, 4, 4, 5]
+            self.out_h = 2
+            self.out_w = 2
+            self.scale = 0.
+            self.out_size = np.array([3, 8]).astype("int32")
+            self.align_corners = True
+            self.data_layout = "NHWC"
+    """
+
+    class TestNearestInterpWithoutCorners(TestNearestInterpOp):
+        def set_align_corners(self):
+            self.align_corners = False
+
+    class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 7, 5]
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = 2.
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 5, 7]
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = 1.5
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+        def init_test_case(self):
+            self.input_shape = [3, 2, 7, 5]
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = [2.0, 3.0]
+            self.out_size = np.array([66, 40]).astype("int32")
+
+    """
+    # case copied from gpu but disabled in xpu: does not support 5-dim input_shape
+    class TestNearestNeighbor3DInterp(TestNearestInterpOp):
+        def init_test_case(self):
+            self.interp_method = 'nearest'
+            self.input_shape = [3, 2, 4, 7, 5]
+            self.out_d = 8
+            self.out_h = 64
+            self.out_w = 32
+            self.scale = [4.0, 2.0, 3.0]
+            self.out_size = np.array([8, 66, 40]).astype("int32")
+            self.align_corners = True
+    """
+
+    class TestNearestInterpOp_attr_tensor(XPUOpTest):
+        def setUp(self):
+            self.place = paddle.XPUPlace(0)
+            self.init_dtype()
+
+            self.out_size = None
+            self.actual_shape = None
+
+            self.interp_method = 'nearest'
+            self.scale = 0.
+ self.align_corners = True + + self.init_test_case() + self.op_type = "nearest_interp_v2" + self.shape_by_1Dtensor = False + self.scale_by_1Dtensor = False self.attrs = { - 'out_h': self.out_h, - 'out_w': self.out_w, 'interp_method': self.interp_method, 'align_corners': self.align_corners, - 'data_layout': self.data_layout } - if self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - self.scale = [self.scale] - if isinstance(self.scale, list) and len(self.scale) == 1: - self.scale = [self.scale[0], self.scale[0]] - self.attrs['scale'] = self.scale - self.outputs = {'Out': output_np} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', in_place=True) - - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 3, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 3]).astype("int32") - self.align_corners = True - - -""" -# case copied form gpu but disabled in xpu: not support 5-dim input_shape -class TestNearestNeighborInterpCase1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [4, 1, 1, 7, 8] - self.out_d = 1 - self.out_h = 1 - self.out_w = 1 - self.scale = 0. - self.align_corners = True -""" - - -class TestNearestNeighborInterpCase2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.align_corners = True - - -class TestNearestNeighborInterpCase3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.align_corners = True - - -class TestNearestNeighborInterpCase4(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [4, 1, 7, 8] - self.out_h = 1 - self.out_w = 1 - self.scale = 0. - self.out_size = np.array([2, 2]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpCase5(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.out_size = np.array([11, 11]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpCase6(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [1, 1, 32, 64] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([65, 129]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpSame(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 3, 32, 64] - self.out_h = 32 - self.out_w = 64 - self.scale = 0. - self.align_corners = True - - -class TestNearestNeighborInterpActualShape(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. 
- self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -""" -# case copied form gpu but disabled in xpu: not support NHWC data_layout -class TestNearestNeighborInterpDataLayout(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 4, 4, 5] - self.out_h = 2 - self.out_w = 2 - self.scale = 0. - self.out_size = np.array([3, 8]).astype("int32") - self.align_corners = True - self.data_layout = "NHWC" -""" - - -class TestNearestInterpWithoutCorners(TestNearestInterpOp): - def set_align_corners(self): - self.align_corners = False - - -class TestNearestNeighborInterpScale1(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = 2. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpScale2(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 5, 7] - self.out_h = 64 - self.out_w = 32 - self.scale = 1.5 - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -class TestNearestNeighborInterpScale3(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 7, 5] - self.out_h = 64 - self.out_w = 32 - self.scale = [2.0, 3.0] - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - - -""" -# case copied form gpu but disabled in xpu: not support 5-dim input_shape -class TestNearestNeighbor3DInterp(TestNearestInterpOp): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 4, 7, 5] - self.out_d = 8 - self.out_h = 64 - self.out_w = 32 - self.scale = [4.0, 2.0, 3.0] - self.out_size = np.array([8, 66, 40]).astype("int32") - self.align_corners = True -""" - - -class TestNearestInterpOp_attr_tensor(XPUOpTest): - def setUp(self): - self.use_xpu = True - self.out_size = None - self.actual_shape = None - self.init_test_case() - self.op_type = "nearest_interp_v2" - self.shape_by_1Dtensor = False - self.scale_by_1Dtensor = False - self.attrs = { - 'interp_method': self.interp_method, - 'align_corners': self.align_corners, - } - - input_np = np.random.random(self.input_shape).astype("float32") - self.inputs = {'X': input_np} - - if self.scale_by_1Dtensor: - self.inputs['Scale'] = np.array([self.scale]).astype("float32") - elif self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - scale_h = scale_w = float(self.scale) - if isinstance(self.scale, list) and len(self.scale) == 1: - scale_w = scale_h = self.scale[0] - elif isinstance(self.scale, list) and len(self.scale) > 1: - scale_w = self.scale[1] - scale_h = self.scale[0] - out_h = int(self.input_shape[2] * scale_h) - out_w = int(self.input_shape[3] * scale_w) - else: - out_h = self.out_h - out_w = self.out_w - - if self.shape_by_1Dtensor: - self.inputs['OutSize'] = self.out_size - elif self.out_size is not None: - size_tensor = [] - for index, ele in enumerate(self.out_size): - size_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) - self.inputs['SizeTensor'] = size_tensor - - self.attrs['out_h'] = self.out_h - self.attrs['out_w'] = self.out_w - if self.scale: - if isinstance(self.scale, float) or isinstance(self.scale, int): - if self.scale > 0: - self.scale = [self.scale] - if isinstance(self.scale, list) and len(self.scale) == 1: - self.scale 
= [self.scale[0], self.scale[0]] - self.attrs['scale'] = self.scale - output_np = nearest_neighbor_interp_np(input_np, out_h, out_w, 0, 0, - self.out_size, self.actual_shape, - self.align_corners) - self.outputs = {'Out': output_np} - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_grad_with_place(place, ['X'], 'Out', in_place=True) - - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [2, 5, 4, 4] - self.out_h = 3 - self.out_w = 3 - self.scale = 0. - self.out_size = [3, 3] - self.align_corners = True - - -# out_size is a tensor list -class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 3, 9, 6] - self.out_h = 12 - self.out_w = 12 - self.scale = 0. - self.out_size = [8, 12] - self.align_corners = True - - -# out_size is a 1-D tensor -class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 0. - self.out_size = np.array([66, 40]).astype("int32") - self.align_corners = True - self.shape_by_1Dtensor = True - - -# scale is a 1-D tensor -class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): - def init_test_case(self): - self.interp_method = 'nearest' - self.input_shape = [3, 2, 32, 16] - self.out_h = 64 - self.out_w = 32 - self.scale = 2.0 - self.out_size = None - self.align_corners = True - self.scale_by_1Dtensor = True + input_np = np.random.random(self.input_shape).astype(self.dtype) + self.inputs = {'X': input_np} + + if self.scale_by_1Dtensor: + self.inputs['Scale'] = np.array([self.scale]).astype("float32") + elif self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + scale_h = scale_w = float(self.scale) + if isinstance(self.scale, list) and len(self.scale) == 1: + scale_w = scale_h = self.scale[0] + elif isinstance(self.scale, list) and len(self.scale) > 1: + scale_w = self.scale[1] + scale_h = self.scale[0] + out_h = int(self.input_shape[2] * scale_h) + out_w = int(self.input_shape[3] * scale_w) + else: + out_h = self.out_h + out_w = self.out_w + + if self.shape_by_1Dtensor: + self.inputs['OutSize'] = self.out_size + elif self.out_size is not None: + size_tensor = [] + for index, ele in enumerate(self.out_size): + size_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + self.inputs['SizeTensor'] = size_tensor + + self.attrs['out_h'] = self.out_h + self.attrs['out_w'] = self.out_w + if self.scale: + if isinstance(self.scale, float) or isinstance(self.scale, int): + if self.scale > 0: + self.scale = [self.scale] + if isinstance(self.scale, list) and len(self.scale) == 1: + self.scale = [self.scale[0], self.scale[0]] + self.attrs['scale'] = self.scale + output_np = nearest_neighbor_interp_np( + input_np, out_h, out_w, 0, 0, self.out_size, self.actual_shape, + self.align_corners) + self.outputs = {'Out': output_np} + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out', in_place=True) + + def init_test_case(self): + self.input_shape = [2, 5, 4, 4] + 
self.out_h = 3 + self.out_w = 3 + self.out_size = [3, 3] + + # out_size is a tensor list + class TestNearestInterp_attr_tensor_Case1(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.input_shape = [3, 3, 9, 6] + self.out_h = 12 + self.out_w = 12 + self.out_size = [8, 12] + + # out_size is a 1-D tensor + class TestNearestInterp_attr_tensor_Case2(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.out_size = np.array([66, 40]).astype("int32") + self.shape_by_1Dtensor = True + + # scale is a 1-D tensor + class TestNearestInterp_attr_tensor_Case3(TestNearestInterpOp_attr_tensor): + def init_test_case(self): + self.input_shape = [3, 2, 32, 16] + self.out_h = 64 + self.out_w = 32 + self.scale = 2.0 + self.out_size = None + self.scale_by_1Dtensor = True + + +support_types = get_xpu_op_support_types('nearest_interp_v2') +for stype in support_types: + create_test_class(globals(), XPUNearestInterpOpWrapper, stype) if __name__ == "__main__": unittest.main() From 79a32715b9aca4a6e522ffcf91bac82e7a6cd380 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Mon, 7 Mar 2022 17:24:16 +0800 Subject: [PATCH 07/50] [OpTest] Support to test paddle API end-to-end for check_eager (#40169) * add python api test in TestOp * test_python_api if self.python_api is set * fix code by CR --- paddle/fluid/imperative/tracer.cc | 33 +++++++ paddle/fluid/imperative/tracer.h | 5 + paddle/fluid/pybind/imperative.cc | 21 +++++ .../paddle/fluid/tests/unittests/op_test.py | 94 +++++++++++++++++++ .../fluid/tests/unittests/test_selu_op.py | 1 + 5 files changed, 154 insertions(+) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 4336a5c77c178..01c9d2847e0c8 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,12 +18,14 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/common/place.h" DECLARE_bool(use_mkldnn); DECLARE_string(tracer_mkldnn_ops_on); @@ -382,5 +384,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, return false; } +phi::KernelSignature Tracer::GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); + framework::RuntimeContext ctx({}, {}); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(phi::CPUPlace()); + const auto& op_info = op->Info(); + auto* attr_checker = op_info.Checker(); + if (attr_checker) { + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); + } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? 
empty_attrs_map
+                                  : attr_checker->GetDefaultAttrMap();
+  auto dygraph_exe_ctx =
+      imperative::DygraphExecutionContext<imperative::VarBase>(
+          *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs,
+          default_attrs);
+  auto* opbase_with_kernel =
+      dynamic_cast<framework::OperatorWithKernel*>(op.get());
+  PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr,
+                    platform::errors::InvalidArgument(
+                        "This op type:`%s` is not an OperatorWithKernel, only "
+                        "OperatorWithKernel can get KernelSignature",
+                        type));
+  return phi::KernelSignature(
+      std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx)));
+}
+
 }  // namespace imperative
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
index 73ecbbe6143ca..fd13fce6a6e17 100644
--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -28,6 +28,7 @@
 #include "paddle/fluid/imperative/jit/program_desc_tracer.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/core/compat/arg_map_context.h"
 
 namespace paddle {
 namespace imperative {
@@ -154,6 +155,10 @@ class Tracer {
     }
   }
 
+  phi::KernelSignature GetExpectedKernelSignature(
+      const std::string& type, const NameVarBaseMap& ins,
+      const NameVarBaseMap& outs, framework::AttributeMap attrs) const;
+
   paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists(
       const platform::Place& place);
 
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 3da17b95a66ba..9b373a58181f1 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -56,6 +56,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/pybind_boost_headers.h"
 #include "paddle/fluid/pybind/slice_utils.h"
 #include "paddle/fluid/pybind/tensor_py.h"
+#include "paddle/phi/core/compat/arg_map_context.h"
 
 namespace paddle {
 namespace pybind {
@@ -2073,6 +2074,26 @@ void BindImperative(py::module *m_ptr) {
                 *(imperative::AmpOperators::Instance().GetMutableAllowOps()),
                 *(imperative::AmpOperators::Instance().GetMutableBlockOps()));
           })
+      .def("_get_kernel_signature",
+           [](imperative::Tracer &self, const std::string &type,
+              const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
+              framework::AttributeMap attrs) {
+             // TODO(xiongkun): move this function outside of tracer.
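+             // Convert the Python-side name->VarBase maps into C++ maps, ask
+             // the tracer for the expected phi kernel signature of this op,
+             // and flatten the signature's SmallVector members into
+             // std::vector so pybind11 can return them as a tuple of lists.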
+             auto ins_map = ConvertToNameVarBaseMap(ins);
+             auto outs_map = ConvertToNameVarBaseMap(outs);
+             {
+               auto to_vector = [](paddle::SmallVector<std::string> &vec) {
+                 return std::vector<std::string>(vec.begin(), vec.end());
+               };
+               auto ret = self.GetExpectedKernelSignature(type, ins_map,
+                                                          outs_map, attrs);
+               auto kernelsig_ins = to_vector(std::get<0>(ret.args));
+               auto kernelsig_attrs = to_vector(std::get<1>(ret.args));
+               auto kernelsig_outs = to_vector(std::get<2>(ret.args));
+               return std::make_tuple(kernelsig_ins, kernelsig_attrs,
+                                      kernelsig_outs);
+             }
+           })
       .def("trace",
            [](imperative::Tracer &self, const std::string &type,
               const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs,
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 628791afef5f6..0c7f269a087b8 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -29,6 +29,7 @@
 
 import paddle
 import paddle.fluid as fluid
+from paddle.fluid.framework import _dygraph_tracer
 import paddle.fluid.core as core
 from paddle.fluid.framework import _in_eager_mode
 from paddle.fluid.framework import _test_eager_guard
@@ -395,6 +396,7 @@ def is_xpu_op(self):
             hasattr(self, "attrs") and "use_xpu" in self.attrs and
             self.attrs["use_xpu"] == True)
 
+    # infer the dtype from the given inputs and outputs.
     def infer_dtype_from_inputs_outputs(self, inputs, outputs):
         def is_np_data(input):
             return isinstance(input, (np.ndarray, np.generic))
@@ -679,6 +681,91 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict):
         else:
             return var_dict
 
+    def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place):
+        """ for quick verification, here we take the simplest strategy:
+            1. we only check the variables in api_outs.
+            2. we simply compare their numpy (tensor) values.
+            3. we set atol and rtol to 1e-5, because they are unrelated to dtype.
+        """
+        for name in api_outs:
+            np_api = np.array(api_outs[name])
+            np_dyg = np.array(dygraph_outs[name])
+            self.assertTrue(
+                np.allclose(
+                    np_api, np_dyg, equal_nan=False),
+                "Output (" + name + ") has diff at " + str(place) + "\nExpect "
+                + str(np_dyg) + "\n" + "But Got " + str(np_api) + " in class "
+                + self.__class__.__name__)
+
+    def _calc_python_api_output(self, place):
+        def prepare_python_api_arguments(op_proto_ins, op_proto_attrs,
+                                         kernel_sig):
+            """ map from `op proto inputs and attrs` to `api input list and api attrs dict`
+            """
+            # NOTE(xiongkun): why don't we use the input argument dicts directly?
+            # Because we don't know the python api name of each argument.
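+            # Instead we rely on the kernel signature: inputs_sig fixes the
+            # positional order of the api arguments, and attrs_sig tells us
+            # which op attributes map to keyword arguments.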
+            inputs_sig, attrs_sig, outputs_sig = kernel_sig
+            input_arguments = [op_proto_ins[name] for name in inputs_sig]
+            attr_arguments = {
+                name: op_proto_attrs[name]
+                for name in attrs_sig if name in op_proto_attrs
+            }
+            return input_arguments, attr_arguments
+
+        def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):
+            if not isinstance(ret_tuple, (tuple, list)):
+                ret_tuple = [ret_tuple]
+            assert len(output_sig) == len(
+                ret_tuple), "expect %d outputs, but got %d outputs" % (
+                    len(output_sig), len(ret_tuple))
+            return {a: b for a, b in zip(output_sig, ret_tuple)}
+
+        def assumption_assert_and_transform(args, argvs):
+            """
+            currently we only support "X" as [Tensor]; multi-tensor in "X" is not supported
+            """
+            for inp in args:
+                assert isinstance(inp, list) and len(
+                    inp
+                ) == 1, "currently we only support `X` as [Tensor]; multi-tensor in `X` is not supported"
+            args = [inp[0] for inp in args]
+            return args, argvs
+
+        def cal_python_api(python_api, args, argvs, kernel_sig):
+            args, argvs = assumption_assert_and_transform(args, argvs)
+            inputs_sig, attrs_sig, outputs_sig = kernel_sig
+            ret_tuple = python_api(*args, **argvs)
+            return construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig)
+
+        with fluid.dygraph.base.guard(place=place):
+            block = fluid.default_main_program().global_block()
+            op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
+            # prepare input variable
+            inputs = self.append_input_output_for_dygraph(op_proto, self.inputs,
+                                                          True, False, block)
+            # prepare output variable
+            outputs = self.append_input_output_for_dygraph(
+                op_proto, self.outputs, False, False, block)
+
+            # prepare attributes
+            attrs_outputs = {}
+            if hasattr(self, "attrs"):
+                for attrs_name in self.attrs:
+                    if self.attrs[attrs_name] is not None:
+                        attrs_outputs[attrs_name] = self.attrs[attrs_name]
+
+            kernel_sig = _dygraph_tracer()._get_kernel_signature(
+                self.op_type, inputs, outputs, attrs_outputs)
+
+            assert hasattr(
+                self, "python_api"
+            ), "Please set the `self.python_api` if you want to compare python api output."
+            arg, argv = prepare_python_api_arguments(inputs, attrs_outputs,
+                                                     kernel_sig)
+            """ we directly return the cal_python_api value because the value is already a tensor.
+ """ + return cal_python_api(self.python_api, arg, argv, kernel_sig) + def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): self.__class__.op_type = self.op_type # for ci check, please not delete it for now with fluid.dygraph.base.guard(place=place): @@ -699,6 +786,7 @@ def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): for attrs_name in self.attrs: if self.attrs[attrs_name] is not None: attrs_outputs[attrs_name] = self.attrs[attrs_name] + block.append_op( type=self.op_type, inputs=inputs, @@ -1150,6 +1238,12 @@ def check_output_with_place(self, if check_dygraph: dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) + + if hasattr(self, "python_api"): + api_outs = self._calc_python_api_output(place) + self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs, + place) + if check_eager: with _test_eager_guard(): eager_dygraph_outs = self._calc_dygraph_output( diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index e71adae8d9b6e..f16198817945a 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -42,6 +42,7 @@ def ref_selu(x, class SeluTest(OpTest): def setUp(self): self.op_type = "selu" + self.python_api = paddle.nn.functional.selu self.x_shape = [3, 5, 5, 10] self.dtype = np.float64 self.init_x_shape() From b798fb071e8f2861f6c59b073f3389ea1d897fde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E6=98=8E=E5=86=AC?= <78149749+winter-wang@users.noreply.github.com> Date: Mon, 7 Mar 2022 21:38:16 +0800 Subject: [PATCH 08/50] [infrt] fold the infrt.cvtTensorOp. test=develop (#40214) --- cmake/external/llvm.cmake | 4 +- paddle/infrt/CMakeLists.txt | 2 - paddle/infrt/dialect/infrt/CMakeLists.txt | 2 + .../infrt/dialect/infrt/pass/CMakeLists.txt | 7 +++ .../infrt/dialect/infrt/pass/infrt_op_fuse.td | 23 ++++++++ .../dialect/infrt/pass/infrt_op_fuse_pass.cc | 52 +++++++++++++++++++ .../dialect/infrt/pass/infrt_op_fuse_pass.h | 24 +++++++++ paddle/infrt/dialect/pd_op_base.td | 2 +- paddle/infrt/dialect/phi/phi_ir_exec.cc | 2 + .../dialect/{pten => phi}/dense_tensor.mlir | 0 .../pten_pass.mlir => phi/phi_pass.mlir} | 0 tools/infrt/custom_pdop.td | 2 +- 12 files changed, 114 insertions(+), 6 deletions(-) create mode 100644 paddle/infrt/dialect/infrt/pass/CMakeLists.txt create mode 100644 paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td create mode 100644 paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc create mode 100644 paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h rename paddle/infrt/tests/dialect/{pten => phi}/dense_tensor.mlir (100%) rename paddle/infrt/tests/dialect/{pten/pten_pass.mlir => phi/phi_pass.mlir} (100%) diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index a7a9e85ffd731..9f6fd32ad986c 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -100,8 +100,8 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) + add_public_tablegen_target(MLIR${td_base}IncGen) + add_dependencies(mlir-headers MLIR${td_base}IncGen) endfunction() # Execute the mlir script with infrt-exec program. 
diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index f2768f3dfa88d..ed29b5b44c779 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -95,9 +95,7 @@ set(infrt_mlir_incs dense_tensor_inc pd_ops_inc pd_extra_ops_inc - rewrite_inc trt_ops_inc - pd_lower_to_trt_inc ) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/infrt/CMakeLists.txt b/paddle/infrt/dialect/infrt/CMakeLists.txt index daf710e0baf54..08ce2d4707bfd 100644 --- a/paddle/infrt/dialect/infrt/CMakeLists.txt +++ b/paddle/infrt/dialect/infrt/CMakeLists.txt @@ -13,3 +13,5 @@ mlir_tablegen(infrt_opsAttributes.h.inc -gen-attrdef-decls -dialect=infrt) mlir_tablegen(infrt_opsAttributes.cpp.inc -gen-attrdef-defs -dialect=infrt) add_public_tablegen_target(MLIRinfrt_opsAttributesIncGen) add_dependencies(mlir-headers MLIRinfrt_opsAttributesIncGen) + +add_subdirectory(pass) diff --git a/paddle/infrt/dialect/infrt/pass/CMakeLists.txt b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt new file mode 100644 index 0000000000000..19c12251a2e6b --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/CMakeLists.txt @@ -0,0 +1,7 @@ +core_gather_headers() + +gather_srcs(infrt_src SRCS + infrt_op_fuse_pass.cc + ) + +mlir_add_rewriter(infrt_op_fuse) diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td new file mode 100644 index 0000000000000..ef702650b6f1b --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse.td @@ -0,0 +1,23 @@ +#ifndef INFRT_OP_FUSE +#define INFRT_OP_FUSE + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "paddle/infrt/dialect/infrt/infrt_ops.td" +include "paddle/infrt/dialect/pd_ops.td" + +def FuseCvtTensorPattern : Pat< + (Infrt_CvtTensorOp (Infrt_CvtTensorOp $arg)), + (Infrt_CvtTensorOp $arg)>; + +def FuseFeedCvtTensorPattern : Pat< + (Infrt_CvtTensorOp (PD_FeedOp $name)), + (PD_FeedOp $name)>; + +def TypesAreIdentical : Constraint>; +def RedundantCvtTensorOptPattern : Pat< + (Infrt_CvtTensorOp:$res $arg), (replaceWithValue $arg), + [(TypesAreIdentical $res, $arg)]>; + + + +#endif // INFRT_OP_FUSE diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc new file mode 100644 index 0000000000000..cb16e054418b3 --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" + +#include +#include "paddle/infrt/dialect/infrt/infrt_dialect.h" +#include "paddle/infrt/dialect/pd_ops.h" +namespace { +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse.cpp.inc" // NOLINT + +/* + * infrtOpFusePass. + */ +struct InfrtOpFusePass + : public mlir::PassWrapper { + public: + ::llvm::StringRef getName() const override { return "infrtOpFusePass"; } + void runOnFunction() override; +}; +// Implementation of the InfrtOpFusePass. 
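Read together, the three DRR patterns above form a small peephole pass: a cvt of a cvt collapses into a single cvt of the original value, a cvt fed directly by pd.feed folds away into the feed itself, and a cvt whose result type already matches its operand is replaced by the operand outright. The same folding, sketched over a toy use-def chain in Python (purely illustrative, not the generated DRR code):

    def fold_cvt(op):
        # op is a dict: {'name': str, 'operand': dict or value, 'type': str}
        if op['name'] != 'cvt':
            return op
        src = op['operand']
        while isinstance(src, dict) and src['name'] == 'cvt':
            src = src['operand']                    # cvt(cvt(x)) -> cvt(x)
        if isinstance(src, dict) and src['name'] == 'feed':
            return src                              # cvt(feed(x)) -> feed(x)
        if isinstance(src, dict) and src.get('type') == op['type']:
            return src                              # identity cvt -> operand
        return {'name': 'cvt', 'operand': src, 'type': op['type']}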
+void InfrtOpFusePass::runOnFunction() { + ::mlir::RewritePatternSet patterns(&getContext()); + populateWithGenerated(patterns); + (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + // Fuse pd.return Operation + auto terminator_op = getFunction().front().getTerminator(); + if (nullptr == terminator_op) return; + for (auto operand : terminator_op->getOperands()) { + auto *op1 = operand.getDefiningOp(); + auto cvt_op = ::llvm::dyn_cast<::infrt::CvtTensorOp>(op1); + if (!cvt_op) continue; + mlir::Value value = cvt_op.input(); + operand.replaceAllUsesWith(value); + cvt_op.erase(); + } +} +} // namespace +std::unique_ptr infrt::createInfrtOpFusePass() { + return std::make_unique(); +} diff --git a/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h new file mode 100644 index 0000000000000..ef349a7bbc4c6 --- /dev/null +++ b/paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +namespace infrt { +/* + * infrtOpFusePass. + */ +std::unique_ptr createInfrtOpFusePass(); + +} // namespace infrt diff --git a/paddle/infrt/dialect/pd_op_base.td b/paddle/infrt/dialect/pd_op_base.td index 266bdf60de788..26425e3945caa 100644 --- a/paddle/infrt/dialect/pd_op_base.td +++ b/paddle/infrt/dialect/pd_op_base.td @@ -75,7 +75,7 @@ def PD_ElementType : Type; def PD_Tensor1 : TensorOf<[PD_ElementType]>; -def PD_Tensor : AnyTypeOf<[PD_Tensor1, LoDTensor],"pd.ttype">; +def PD_Tensor : AnyTypeOf<[PD_Tensor1, LoDTensor, DenseTensor],"pd.ttype">; def PD_Tensor_Array : VectorOf<[PD_Tensor]>; diff --git a/paddle/infrt/dialect/phi/phi_ir_exec.cc b/paddle/infrt/dialect/phi/phi_ir_exec.cc index 1df929895b1c7..559fb90a64a78 100644 --- a/paddle/infrt/dialect/phi/phi_ir_exec.cc +++ b/paddle/infrt/dialect/phi/phi_ir_exec.cc @@ -16,6 +16,7 @@ #include #include #include "paddle/infrt/common/global.h" +#include "paddle/infrt/dialect/infrt/pass/infrt_op_fuse_pass.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/phi/pass/phi_op_cvt_pass.h" @@ -38,6 +39,7 @@ int main(int argc, char** argv) { infrt::PrecisionType::FLOAT32, infrt::LayoutType::NCHW}}; phi_pass_manager.addPass(std::make_unique(valid_places)); + phi_pass_manager.addPass(infrt::createInfrtOpFusePass()); if (mlir::failed(pm.run(*module))) { std::cout << "\npass failed!\n" << std::endl; return 4; diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/phi/dense_tensor.mlir similarity index 100% rename from paddle/infrt/tests/dialect/pten/dense_tensor.mlir rename to paddle/infrt/tests/dialect/phi/dense_tensor.mlir diff --git a/paddle/infrt/tests/dialect/pten/pten_pass.mlir b/paddle/infrt/tests/dialect/phi/phi_pass.mlir similarity index 100% rename from paddle/infrt/tests/dialect/pten/pten_pass.mlir rename to paddle/infrt/tests/dialect/phi/phi_pass.mlir diff --git 
a/tools/infrt/custom_pdop.td b/tools/infrt/custom_pdop.td index 83e2957831296..2139fbd8155bb 100644 --- a/tools/infrt/custom_pdop.td +++ b/tools/infrt/custom_pdop.td @@ -1,4 +1,4 @@ -def PD_FeedOp : PD_Op<"feed"> { +def PD_FeedOp : PD_Op<"feed", [NoSideEffect]> { let summary = "Feed Op"; let description = [{ From 10325a82e1032c3397b6f6611f558eb18ede0b07 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 8 Mar 2022 09:55:10 +0800 Subject: [PATCH 09/50] add python profiler package (#40065) * add python profiler package * update according to review * fix bug * fix bug * fix bug * add unit test * Revert "add unit test" This reverts commit 4e69ff71b0645e069afe5dd8fea0d07717852c48. * reduce for pr * add unit test * modify for pr * fix unittest * update for ci coverage * modify according to review * fix bug * improve coverage --- paddle/fluid/platform/profiler.cc | 4 + paddle/fluid/platform/profiler.h | 1 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 85 ++++ python/paddle/fluid/core.py | 2 + .../fluid/tests/unittests/test_newprofiler.py | 129 +++++ python/paddle/profiler/__init__.py | 26 + python/paddle/profiler/profiler.py | 469 ++++++++++++++++++ python/paddle/profiler/profiler_statistic.py | 31 ++ python/paddle/profiler/utils.py | 90 ++++ python/setup.py.in | 1 + 11 files changed, 839 insertions(+), 1 deletion(-) create mode 100755 python/paddle/fluid/tests/unittests/test_newprofiler.py create mode 100644 python/paddle/profiler/__init__.py create mode 100644 python/paddle/profiler/profiler.py create mode 100644 python/paddle/profiler/profiler_statistic.py create mode 100644 python/paddle/profiler/utils.py diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 866bf3c66aa2a..feb72bce72bf8 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -489,6 +489,10 @@ void NvprofDisableRecordEvent() { g_enable_nvprof_hook = false; } void EnableHostEventRecorder() { FLAGS_enable_host_event_recorder_hook = true; } +void DisableHostEventRecorder() { + FLAGS_enable_host_event_recorder_hook = false; +} + std::string PrintHostEvents() { std::ostringstream oss; auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 122e19b7c2808..78275341cbbf7 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -216,6 +216,7 @@ void NvprofEnableRecordEvent(); void NvprofDisableRecordEvent(); void EnableHostEventRecorder(); +void DisableHostEventRecorder(); // Defined for UT std::string PrintHostEvents(); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5e61133510d6a..7ff501ef43df7 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_ feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store) + cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store new_profiler) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) diff --git 
a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0a1cf604d2e8a..fcfc3e6a3797d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -78,6 +78,9 @@ limitations under the License. */ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_python.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/distributed_py.h" #include "paddle/fluid/pybind/eager.h" @@ -2913,6 +2916,88 @@ All parameter, weight, gradient are variables in Paddle. }); m.def("size_of_dtype", framework::SizeOfType); + py::class_(m, "_ProfilerResult") + .def(py::init<>()) + .def("get_data", &paddle::platform::ProfilerResult::GetData, + py::return_value_policy::automatic_reference) + .def("save", &paddle::platform::ProfilerResult::Save) + .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo); + + py::class_(m, "DevicePythonNode") + .def(py::init<>()) + .def_readwrite("name", &paddle::platform::DevicePythonNode::name) + .def_readwrite("type", &paddle::platform::DevicePythonNode::type) + .def_readwrite("start_ns", &paddle::platform::DevicePythonNode::start_ns) + .def_readwrite("end_ns", &paddle::platform::DevicePythonNode::end_ns) + .def_readwrite("device_id", + &paddle::platform::DevicePythonNode::device_id) + .def_readwrite("context_id", + &paddle::platform::DevicePythonNode::context_id) + .def_readwrite("stream_id", + &paddle::platform::DevicePythonNode::stream_id); + + py::class_(m, "HostPythonNode") + .def(py::init<>()) + .def_readwrite("name", &paddle::platform::HostPythonNode::name) + .def_readwrite("type", &paddle::platform::HostPythonNode::type) + .def_readwrite("start_ns", &paddle::platform::HostPythonNode::start_ns) + .def_readwrite("end_ns", &paddle::platform::HostPythonNode::end_ns) + .def_readwrite("process_id", + &paddle::platform::HostPythonNode::process_id) + .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) + .def_readwrite("children_node", + &paddle::platform::HostPythonNode::children_node_ptrs) + .def_readwrite("runtime_node", + &paddle::platform::HostPythonNode::runtime_node_ptrs) + .def_readwrite("device_node", + &paddle::platform::HostPythonNode::device_node_ptrs); + + py::class_(m, "_Profiler") + .def("create", &paddle::platform::Profiler::Create, + py::return_value_policy::take_ownership) + .def("prepare", + [](paddle::platform::Profiler *profiler) { + platform::EnableHostEventRecorder(); + profiler->Prepare(); + }) + .def("start", &paddle::platform::Profiler::Start) + .def("stop", + [](paddle::platform::Profiler *profiler) { + platform::DisableHostEventRecorder(); + return profiler->Stop(); + }, + py::return_value_policy::automatic_reference); + + py::class_(m, "ProfilerOptions") + .def(py::init<>()) + .def_readwrite("trace_switch", + &paddle::platform::ProfilerOptions::trace_switch); + + py::class_(m, "_RecordEvent") + .def(py::init([](std::string name, platform::TracerEventType type) { + return std::make_unique( + name, type, 1, paddle::platform::EventRole::kOrdinary); + })) + .def("end", [](platform::RecordEvent *event) { event->End(); }); + + py::enum_(m, "TracerEventType") + .value("Operator", paddle::platform::TracerEventType::Operator) + .value("Dataloader", paddle::platform::TracerEventType::Dataloader) + .value("ProfileStep", 
paddle::platform::TracerEventType::ProfileStep) + .value("CudaRuntime", paddle::platform::TracerEventType::CudaRuntime) + .value("Kernel", paddle::platform::TracerEventType::Kernel) + .value("Memcpy", paddle::platform::TracerEventType::Memcpy) + .value("Memset", paddle::platform::TracerEventType::Memset) + .value("UserDefined", paddle::platform::TracerEventType::UserDefined) + .value("OperatorInner", paddle::platform::TracerEventType::OperatorInner) + .value("Forward", paddle::platform::TracerEventType::Forward) + .value("Backward", paddle::platform::TracerEventType::Backward) + .value("Optimization", paddle::platform::TracerEventType::Optimization) + .value("Communication", paddle::platform::TracerEventType::Communication) + .value("PythonOp", paddle::platform::TracerEventType::PythonOp) + .value("PythonUserDefined", + paddle::platform::TracerEventType::PythonUserDefined); + m.def("load_profiler_result", &paddle::platform::LoadProfilerResult); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("set_cublas_switch", platform::SetAllowTF32Cublas); diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 5e023e9248cab..617ab6305289f 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -283,6 +283,7 @@ def to_list(s): from .core_avx import _set_cached_executor_build_strategy from .core_avx import _device_synchronize from .core_avx import _get_current_stream + from .core_avx import _Profiler, _ProfilerResult, _RecordEvent from .core_avx import _set_current_stream if sys.platform != 'win32': from .core_avx import _set_process_pids @@ -344,6 +345,7 @@ def to_list(s): from .core_noavx import _device_synchronize from .core_noavx import _get_current_stream from .core_noavx import _set_current_stream + from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent if sys.platform != 'win32': from .core_noavx import _set_process_pids from .core_noavx import _erase_process_pids diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler.py b/python/paddle/fluid/tests/unittests/test_newprofiler.py new file mode 100755 index 0000000000000..12fb0fa61b005 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_newprofiler.py @@ -0,0 +1,129 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
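The Python-side Profiler added later in this patch drives exactly these bindings; for reference, the raw lifecycle they expose is (a sketch against the bindings registered above):

    from paddle.fluid import core

    opts = core.ProfilerOptions()
    opts.trace_switch = 1                 # bit 0 = CPU trace, bit 1 = GPU trace
    prof = core._Profiler.create(opts)
    prof.prepare()                        # also enables the host event recorder
    prof.start()
    # ... workload to be profiled ...
    result = prof.stop()                  # _ProfilerResult; recorder is disabled
    result.save('trace.pb', 'pb')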
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.profiler as profiler + + +class TestProfiler(unittest.TestCase): + def test_profiler(self): + def my_trace_back(prof): + profiler.export_chrome_tracing('./test_profiler_chrometracing/')( + prof) + profiler.export_protobuf('./test_profiler_pb/')(prof) + + x_value = np.random.randn(2, 3, 3) + x = paddle.to_tensor( + x_value, stop_gradient=False, place=paddle.CPUPlace()) + y = x / 2.0 + ones_like_y = paddle.ones_like(y) + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU], ) as prof: + y = x / 2.0 + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=(1, 2)) as prof: + with profiler.RecordEvent(name='test'): + y = x / 2.0 + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=profiler.make_scheduler( + closed=0, ready=1, record=1, repeat=1), + on_trace_ready=my_trace_back) as prof: + y = x / 2.0 + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=profiler.make_scheduler( + closed=0, ready=0, record=2, repeat=1), + on_trace_ready=my_trace_back) as prof: + for i in range(3): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN, + on_trace_ready=my_trace_back) as prof: + for i in range(2): + y = x / 2.0 + prof.step() + + def my_sheduler(num_step): + if num_step % 5 < 2: + return profiler.ProfilerState.RECORD_AND_RETURN + elif num_step % 5 < 3: + return profiler.ProfilerState.READY + elif num_step % 5 < 4: + return profiler.ProfilerState.RECORD + else: + return profiler.ProfilerState.CLOSED + + def my_sheduler1(num_step): + if num_step % 5 < 2: + return profiler.ProfilerState.RECORD + elif num_step % 5 < 3: + return profiler.ProfilerState.READY + elif num_step % 5 < 4: + return profiler.ProfilerState.RECORD + else: + return profiler.ProfilerState.CLOSED + + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=lambda x: profiler.ProfilerState.RECORD_AND_RETURN, + on_trace_ready=my_trace_back) as prof: + for i in range(2): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=my_sheduler, + on_trace_ready=my_trace_back) as prof: + for i in range(5): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=my_sheduler1) as prof: + for i in range(5): + y = x / 2.0 + prof.step() + prof = None + with profiler.Profiler( + targets=[profiler.ProfilerTarget.CPU], + scheduler=profiler.make_scheduler( + closed=1, ready=1, record=2, repeat=1, skip_first=1), + on_trace_ready=my_trace_back) as prof: + for i in range(5): + y = x / 2.0 + paddle.grad(outputs=y, inputs=[x], grad_outputs=ones_like_y) + prof.step() + + prof.export(path='./test_profiler_pb.pb', format='pb') + prof.summary() + result = profiler.utils.load_profiler_result('./test_profiler_pb.pb') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/profiler/__init__.py b/python/paddle/profiler/__init__.py new file mode 100644 index 0000000000000..4999e703f2a5a --- /dev/null +++ b/python/paddle/profiler/__init__.py @@ -0,0 +1,26 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .profiler import ProfilerState, ProfilerTarget +from .profiler import make_scheduler, export_chrome_tracing, export_protobuf +from .profiler import Profiler +from .profiler import TracerEventType +from .utils import RecordEvent, load_profiler_result +from .profiler_statistic import SortedKeys + +__all__ = [ + 'ProfilerState', 'ProfilerTarget', 'TracerEventType', 'make_scheduler', + 'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent', + 'load_profiler_result', 'SortedKeys' +] diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py new file mode 100644 index 0000000000000..dc637bf983046 --- /dev/null +++ b/python/paddle/profiler/profiler.py @@ -0,0 +1,469 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import socket +import datetime +from enum import Enum +from typing import Any, Callable, Iterable, Optional, Union +from warnings import warn + +import paddle +from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, + TracerEventType) + +from .utils import RecordEvent, wrap_optimizers +from .profiler_statistic import SortedKeys + + +class ProfilerState(Enum): + r""" + Profiler state that can be specified to control profiler action. + + CLOSED: The profilers are closed. + READY: The profilers are open, but the data will not be recorded. + This state is used for reducing overhead influence when profilers start. + RECORD: The profilers are open, and the data will be recorded. + RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period, + the collected data will be returned. + """ + CLOSED = 0 + READY = 1 + RECORD = 2 + RECORD_AND_RETURN = 3 # the last step of RECORD + + +class ProfilerTarget(Enum): + r""" + Target device for profiling. + """ + CPU = 0 + GPU = 1 + + +def make_scheduler(*, + closed: int, + ready: int, + record: int, + repeat: int=0, + skip_first: int=0) -> Callable: + r""" + Return a scheduler function, which scheduler the state according to the setting. + The state transform confirms to: + + (CLOSED) (CLOSED) (CLOSED) (READY) (RECORD,last RETURN) (CLOSED) + START -> skip_first -> closed -> ready -> record -> END + | | + | | (if has_repeated < repeat) + - - - - - - - - - - - - + Note that repeat <= 0 means the cycle will continue until the profiler exits. + + Parameters: + closed(int): The number of steps in state ProfilerState.CLOSED. 
+ ready(int): The number of steps in state ProfilerState.READY. + record(int): The number of steps in state ProfilerState.RECORD. + repeat(int): The number of cycles to repeat above state transform. + skip_first(int): The number of first steps to drop, not participate in the state transform. + + Returns: + A scheduler function, conforms to above state transform setting. + + Examples: + 1. profiling range [2, 5] + batch 0: closed, batch 1: ready, batch [2, 5] record + .. code-block:: python + make_scheduler(closed=1, ready=1, record=4, repeat=1) + 2. profiling range [3,6], [9,12], [15,18]... + batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat + .. code-block:: python + make_scheduler(closed=1, ready=1, record=4, skip_first=1) + """ + + def getScheduleState(step: int) -> ProfilerState: + assert step >= 0 + if step < skip_first: # within skip_first, just skip + return ProfilerState.CLOSED + step = step - skip_first + period_steps = closed + ready + record + has_repeated = step // period_steps + if repeat > 0 and has_repeated >= repeat: # the period has repeated repeat times, return CLOSED state + return ProfilerState.CLOSED + mod_step = step % period_steps + if mod_step < closed: + return ProfilerState.CLOSED + elif mod_step >= closed and mod_step < closed + ready: + return ProfilerState.READY + else: + if mod_step < period_steps - 1: + return ProfilerState.RECORD + else: + return ProfilerState.RECORD_AND_RETURN + assert closed >= 0 and ready >= 0 and record > 0 and \ + repeat >= 0 and skip_first >= 0, "Invalid profiler scheduler arguments" + if ready == 0: + warn("Profiler will record data after enabling profiler immediately, \ + some data collected at the beginning of profiling may be 'noisy' because of overhead." + ) + return getScheduleState + + +def _default_state_scheduler(step: int): + r""" + A default state scheduler, keep recording from the begining of the profiler until ending. + """ + return ProfilerState.RECORD + + +def export_chrome_tracing(dir_name: str, + worker_name: Optional[str]=None) -> Callable: + r""" + Return a callable, used for outputing tracing data to chrome tracing format file. + The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + if worker_name is not set, the default name is [hostname]_[pid]. + + Parameters: + dir_name(str): Directory to save profiling data. + worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + + Examples: + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + train() + p.step() + """ + if not os.path.exists(dir_name): + try: + os.makedirs(dir_name, exist_ok=True) + except Exception: + raise RuntimeError( + "Can not create directory '{}' for saving profiling results.". + format(dir_name)) + + def handle_fn(prof): + nonlocal worker_name + if not worker_name: + worker_name = "host_{}pid_{}".format(socket.gethostname(), + str(os.getpid())) + now = datetime.datetime.now() + filename = '{}_time_{}.paddle_trace.json'.format( + worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')) + prof.export(os.path.join(dir_name, filename), "json") + + return handle_fn + + +def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: + r""" + Return a callable, used for outputing tracing data to protobuf file. 
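The returned callable is a pure function of the step index, so a schedule is easy to sanity-check by enumeration (a sketch; ProfilerState as defined above):

    sched = make_scheduler(closed=1, ready=1, record=2, repeat=1, skip_first=1)
    print([sched(i).name for i in range(6)])
    # ['CLOSED', 'CLOSED', 'READY', 'RECORD', 'RECORD_AND_RETURN', 'CLOSED']
    # step 0 is skipped, steps 3 and 4 are recorded, step 4 returns the
    # result, and repeat=1 keeps the profiler closed from step 5 onwards.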
+ The output file will be saved in directory 'dir_name', and file name will be set as worker_name. + if worker_name is not set, the default name is [hostname]_[pid]. + + Parameters: + dir_name(str): Directory to save profiling data. + worker_name(Optional[str]): Prefix of the file name saved, default is [hostname]_[pid]. + + Examples: + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (3, 10), + on_trace_ready = profiler.export_protobuf('./log') + ) as p: + for iter in range(N): + train() + p.step() + """ + if not os.path.exists(dir_name): + try: + os.makedirs(dir_name, exist_ok=True) + except Exception: + raise RuntimeError( + "Can not create directory '{}' for saving profiling results.". + format(dir_name)) + + def handle_fn(prof): + nonlocal worker_name + if not worker_name: + worker_name = "host_{}pid_{}".format(socket.gethostname(), + str(os.getpid())) + now = datetime.datetime.now() + filename = '{}_time_{}.paddle_trace.pb'.format( + worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')) + prof.export(os.path.join(dir_name, filename), "pb") + + return handle_fn + + +def _get_supported_targets() -> Iterable[ProfilerTarget]: + r""" + Get the current supported profiler target in the system. + """ + if paddle.device.is_compiled_with_cuda(): + return [ProfilerTarget.CPU, ProfilerTarget.GPU] + return [ProfilerTarget.CPU] + + +class Profiler: + r""" + Profiler context manager, user interface to manage profile process. + + Parameters: + targets (iterable): list of tracing targets, currently supported values: + ``paddle.profiler.ProfilerTarget.CPU``, + ``paddle.profiler.ProfilerTarget.GPU``. + scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``. + If not provided, the default sheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, + which means profiling range [start_batch, end_batch). + on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing. + This callable object will be called when ``sheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. + + Examples: + 1. profiling range [2, 5) + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = (2, 5), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + train() + p.step() + 2. profiling range [2,4], [7, 9], [11,13] + .. code-block:: python + import paddle.profiler as profiler + with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU, + profiler.ProfilerTarget.GPU], + scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3), + on_trace_ready = profiler.export_chrome_tracing('./log') + ) as p: + for iter in range(N): + train() + p.step() + 3. Use profiler without context manager, and use default parameters + .. 
code-block:: python + import paddle.profiler as profiler + p = profiler.Profiler() + p.start() + for iter in range(N): + train() + p.step() + p.stop() + p.summary() + """ + + def __init__( + self, + *, + targets: Optional[Iterable[ProfilerTarget]]=None, + scheduler: Union[Callable[[int], ProfilerState], tuple, None]=None, + on_trace_ready: Optional[Callable[..., Any]]=None): + supported_targets = _get_supported_targets() + if targets: + self.targets = set(targets) + for target in targets: + if target not in supported_targets: + self.targets.remove(target) + warn("Profiling {} is not supported in current context.". + format(target)) + else: + self.targets = supported_targets + profileoption = ProfilerOptions() + if ProfilerTarget.CPU in self.targets: + profileoption.trace_switch |= 1 + if ProfilerTarget.GPU in self.targets: + profileoption.trace_switch |= (1 << 1) + wrap_optimizers() + self.profiler = _Profiler.create(profileoption) + if callable(scheduler): + self.scheduler = scheduler + elif isinstance(scheduler, (tuple, list)): + assert len(scheduler) == 2 and scheduler[1] > scheduler[0] + start_batch, end_batch = scheduler + start_batch = max(start_batch, 0) + if start_batch >= 1: + self.scheduler = make_scheduler( + closed=max(start_batch - 1, 0), + ready=1, + record=(end_batch - start_batch), + repeat=1) + else: + self.scheduler = make_scheduler( + closed=0, + ready=0, + record=(end_batch - start_batch), + repeat=1) + else: + self.scheduler = _default_state_scheduler + + if on_trace_ready == None: + self.on_trace_ready = export_chrome_tracing('./profiler_log/') + else: + self.on_trace_ready = on_trace_ready + self.step_num = 0 + self.previous_state = ProfilerState.CLOSED + self.current_state = self.scheduler(self.step_num) + self.record_event = None + self.profiler_result = None + + def __enter__(self): + self.start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.stop() + + def start(self): + r''' + Start profiler and enter the first profiler step(0). + State transformed from CLOSED to self.current_state and trigger corresponding action. + ''' + # CLOSED -> self.current_state + if self.current_state == ProfilerState.READY: + self.profiler.prepare() + elif self.current_state == ProfilerState.RECORD: + self.profiler.prepare() + self.profiler.start() + elif self.current_state == ProfilerState.RECORD_AND_RETURN: + self.profiler.prepare() + self.profiler.start() + self.record_event = RecordEvent( + name="ProfileStep#{}".format(self.step_num), + event_type=TracerEventType.ProfileStep) + self.record_event.begin() + + def stop(self): + r''' + Stop profiler and State transformed from self.current_state to CLOSED. + Trigger corresponding action and post-process profiler result using self.on_trace_ready if result exists. + ''' + # self.current_state -> CLOSED + # In this situation, RECORD state is regarded as RECORD_AND_RETURN + if self.record_event: + self.record_event.end() + self.record_event = None + if self.current_state == ProfilerState.READY: + warn( + "Inproper Profiler state transform: READY->CLOSED, profiler will start and stop without saving data" + ) + self.profiler.start() + self.profiler.stop() + if self.current_state == ProfilerState.RECORD or self.current_state == ProfilerState.RECORD_AND_RETURN: + self.profiler_result = self.profiler.stop() + if self.on_trace_ready: + self.on_trace_ready(self) + + def step(self): + r""" + Signals the profiler that the next profiling step has started. + Get the new ProfilerState and trigger corresponding action. 
+ """ + if self.record_event: + self.record_event.end() + self.record_event = None + self.previous_state = self.current_state + self.step_num += 1 + self.current_state = self.scheduler(self.step_num) + self._trigger_action() + self.record_event = RecordEvent( + name="ProfileStep#{}".format(self.step_num), + event_type=TracerEventType.ProfileStep) + self.record_event.begin() + + def _trigger_action(self): + if self.previous_state == ProfilerState.CLOSED: + if self.current_state == ProfilerState.READY: # CLOSED -> READY + self.profiler.prepare() + if self.current_state == ProfilerState.RECORD: # CLOSED -> RECORD + self.profiler.prepare() + self.profiler.start() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # CLOSED -> RECORD_AND_RETURN + self.profiler.prepare() + self.profiler.start() + + elif self.previous_state == ProfilerState.READY: + if self.current_state == ProfilerState.CLOSED: # READY -> CLOSED + warn( + "Improper schedule: READY->CLOSED, profiler will start and stop without saving data" + ) + self.profiler.start() + self.profiler.stop() + if self.current_state == ProfilerState.RECORD: # READY -> RECORD + self.profiler.start() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # READY -> RECORD_AND_RETURN + self.profiler.start() + + elif self.previous_state == ProfilerState.RECORD: + if self.current_state == ProfilerState.CLOSED: # RECORD -> CLOSED + warn( + "Improper schedule: RECORD->CLOSED, profiler will not saving data" + ) + self.profiler.stop() + + if self.current_state == ProfilerState.READY: # RECORD -> READY + warn( + "Improper schedule: RECORD->READY, profiler will stop and re-prepare" + ) + self.profiler.stop() + self.profiler.prepare() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # RECORD -> RECORD_AND_RETURN + pass + + else: + assert self.previous_state == ProfilerState.RECORD_AND_RETURN + if self.current_state == ProfilerState.CLOSED: # RECORD_AND_RETURN -> CLOSED + self.profiler_result = self.profiler.stop() + if self.current_state == ProfilerState.READY: # RECORD_AND_RETURN -> READY + self.profiler_result = self.profiler.stop() + self.profiler.prepare() + if self.current_state == ProfilerState.RECORD: # RECORD_AND_RETURN -> RECORD + self.profiler_result = self.profiler.stop() + self.profiler.prepare() + self.profiler.start() + if self.current_state == ProfilerState.RECORD_AND_RETURN: # RECORD_AND_RETURN -> RECORD_AND_RETURN + self.profiler_result = self.profiler.stop() + self.profiler.prepare() + self.profiler.start() + if self.on_trace_ready: + self.on_trace_ready(self) + + def export(self, path="", format="json"): + r""" + Exports the tracing data in Chrome tracing data format. + """ + if self.profiler_result: + self.profiler_result.save(path, format) + + def summary(self, + sorted_by=SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms'): + r""" + Print the Summary table. + + Parameters: + sorted_by: how to rank the op table items. + detail: expand each operator detail information. + thread_sep: print op table each thread. + time_unit: can be chosen form ['s', 'ms', 'us', 'ns'] + """ + pass diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py new file mode 100644 index 0000000000000..29d586268a014 --- /dev/null +++ b/python/paddle/profiler/profiler_statistic.py @@ -0,0 +1,31 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +from enum import Enum + +from paddle.fluid.core import TracerEventType + + +class SortedKeys(Enum): + r""" + Sorted keys for printing summary table. + """ + CPUTotal = 0 + CPUAvg = 1 + CPUMax = 2 + CPUMin = 3 + GPUTotal = 4 + GPUAvg = 5 + GPUMax = 6 + GPUMin = 7 diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py new file mode 100644 index 0000000000000..642001dfbfc5a --- /dev/null +++ b/python/paddle/profiler/utils.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.core import (_RecordEvent, TracerEventType, + load_profiler_result) +from typing import Any +from warnings import warn +import functools +from contextlib import ContextDecorator + +_AllowedEventTypeList = [ + TracerEventType.Dataloader, TracerEventType.ProfileStep, + TracerEventType.UserDefined, TracerEventType.Forward, + TracerEventType.Backward, TracerEventType.Optimization, + TracerEventType.PythonOp, TracerEventType.PythonUserDefined +] + + +class RecordEvent(ContextDecorator): + r""" + Interface for recording a time range. + + Parameters: + name(str): Name of the record event + event_type(TracerEventType): Type of the record event, can be used for statistics. + + Examples: + .. 
code-block:: python + import paddle.profiler as profiler + with profiler.RecordEvent(name='op1', event_type=TracerEventType=TracerEventType.UserDefined): + op1() + """ + + def __init__(self, + name: str, + event_type: TracerEventType=TracerEventType.UserDefined): + self.name = name + self.event_type = event_type + self.event = None + + def __enter__(self): + self.begin() + return self + + def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any): + self.end() + + def begin(self): + if self.event_type not in _AllowedEventTypeList: + warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ + can be recorded.".format(*_AllowedEventTypeList)) + self.event = None + else: + if self.event_type == TracerEventType.UserDefined: + self.event_type == TracerEventType.PythonUserDefined + self.event = _RecordEvent(self.name, self.event_type) + + def end(self): + if self.event: + self.event.end() + + +def wrap_optimizers(): + def optimizer_warpper(func): + @functools.wraps(func) + def warpper(*args, **kwargs): + with RecordEvent( + 'Optimization Step', + event_type=TracerEventType.Optimization): + return func(*args, **kwargs) + + return warpper + + import paddle.optimizer as optimizer + for classname in optimizer.__all__: + if classname != 'Optimizer': + classobject = getattr(optimizer, classname) + if getattr(classobject, 'step', None) != None: + classobject.step = optimizer_warpper(classobject.step) diff --git a/python/setup.py.in b/python/setup.py.in index 0bc32cfbc00bd..118f617361fc4 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -372,6 +372,7 @@ packages=['paddle', 'paddle.device', 'paddle.device.cuda', 'paddle.version', + 'paddle.profiler' ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: From 1f857cb966191e3e43de7950918595a6a4ca2db2 Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 8 Mar 2022 09:55:54 +0800 Subject: [PATCH 10/50] add profiler statistic helper (#40111) * add profiler helper * fix unittest * improve test coverage rate --- .../unittests/test_newprofiler_helper.py | 137 +++++++++++ python/paddle/profiler/statistic_helper.py | 225 ++++++++++++++++++ 2 files changed, 362 insertions(+) create mode 100755 python/paddle/fluid/tests/unittests/test_newprofiler_helper.py create mode 100644 python/paddle/profiler/statistic_helper.py diff --git a/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py new file mode 100755 index 0000000000000..05e7920035456 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_newprofiler_helper.py @@ -0,0 +1,137 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
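Since RecordEvent above derives from contextlib.ContextDecorator, it can also annotate a whole function, which the docstring does not show; a sketch (the function name is illustrative):

    import paddle.profiler as profiler

    @profiler.RecordEvent(name='forward_pass')   # UserDefined type by default
    def forward_pass(x):
        return x * 2

Every call to forward_pass() is then bracketed by begin()/end() automatically.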
+ +from __future__ import print_function + +import unittest + +import paddle.profiler.statistic_helper as statistic_helper + + +class TestStatisticHelper(unittest.TestCase): + def test_sum_ranges_case1(self): + src = [(1, 3), (4, 10), (11, 15)] + self.assertEqual(statistic_helper.sum_ranges(src), 12) + + def test_sum_ranges_case2(self): + src = [(3, 3), (5, 5), (7, 7)] + self.assertEqual(statistic_helper.sum_ranges(src), 0) + + def test_merge_self_ranges_case1(self): + src = [(1, 5), (2, 7), (4, 9), (14, 19)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 9), (14, 19)]) + src = [(4, 9), (14, 19), (1, 5), (2, 7)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 9), (14, 19)]) + + def test_merge_self_ranges_case2(self): + src = [(1, 1), (2, 3), (4, 7), (5, 12)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 1), (2, 3), (4, 12)]) + src = [(5, 12), (1, 1), (2, 3), (4, 7)] + dst = statistic_helper.merge_self_ranges(src) + self.assertEqual(dst, [(1, 1), (2, 3), (4, 12)]) + + def test_merge_ranges_case1(self): + src1 = [(1, 2), (5, 7), (9, 14)] + src2 = [(1, 2), (4, 9), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (4, 15)]) + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, [(1, 2), (4, 15)]) + src1 = [] + src2 = [] + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, []) + src1 = [(1, 2), (3, 5)] + src2 = [] + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, src1) + src1 = [] + src2 = [(1, 2), (3, 5)] + dst = statistic_helper.merge_ranges(src1, src2, True) + self.assertEqual(dst, src2) + src1 = [(3, 4), (1, 2), (17, 19)] + src2 = [(6, 9), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (3, 4), (6, 9), (13, 15), (17, 19)]) + dst = statistic_helper.merge_ranges(src2, src1) + self.assertEqual(dst, [(1, 2), (3, 4), (6, 9), (13, 15), (17, 19)]) + src1 = [(1, 2), (5, 9), (12, 13)] + src2 = [(6, 8), (9, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (5, 15)]) + dst = statistic_helper.merge_ranges(src2, src1) + self.assertEqual(dst, [(1, 2), (5, 15)]) + + def test_merge_ranges_case2(self): + src1 = [(3, 4), (1, 2), (9, 14)] + src2 = [(6, 9), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (3, 4), (6, 15)]) + src2 = [(9, 14), (1, 2), (5, 7)] + src1 = [(4, 9), (1, 2), (13, 15)] + dst = statistic_helper.merge_ranges(src1, src2) + self.assertEqual(dst, [(1, 2), (4, 15)]) + + def test_intersection_ranges_case1(self): + src1 = [(1, 7), (9, 12), (14, 18)] + src2 = [(3, 8), (10, 13), (15, 19)] + dst = statistic_helper.intersection_ranges(src1, src2) + self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)]) + dst = statistic_helper.intersection_ranges(src1, src2, True) + self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)]) + src1 = [] + src2 = [] + dst = statistic_helper.intersection_ranges(src1, src2, True) + self.assertEqual(dst, []) + src1 = [(3, 7), (10, 12)] + src2 = [(2, 9), (11, 13), (15, 19)] + dst = statistic_helper.intersection_ranges(src1, src2) + self.assertEqual(dst, [(3, 7), (11, 12)]) + dst = statistic_helper.intersection_ranges(src2, src1) + self.assertEqual(dst, [(3, 7), (11, 12)]) + + def test_intersection_ranges_case2(self): + src2 = [(9, 12), (1, 7), (14, 18)] + src1 = [(10, 13), (3, 8), (15, 19), (20, 22)] + dst = 
statistic_helper.intersection_ranges(src1, src2) + self.assertEqual(dst, [(3, 7), (10, 12), (15, 18)]) + src2 = [(1, 7), (14, 18), (21, 23)] + src1 = [(6, 9), (10, 13)] + dst = statistic_helper.intersection_ranges(src1, src2, True) + self.assertEqual(dst, [(6, 7)]) + + def test_subtract_ranges_case1(self): + src1 = [(1, 10), (12, 15)] + src2 = [(3, 7), (9, 11)] + dst = statistic_helper.subtract_ranges(src1, src2, True) + self.assertEqual(dst, [(1, 3), (7, 9), (12, 15)]) + src1 = [(1, 10), (12, 15)] + src2 = [] + dst = statistic_helper.subtract_ranges(src1, src2, True) + self.assertEqual(dst, src1) + dst = statistic_helper.subtract_ranges(src2, src1, True) + self.assertEqual(dst, src2) + + def test_subtract_ranges_case2(self): + src2 = [(12, 15), (1, 10)] + src1 = [(9, 11), (3, 7)] + dst = statistic_helper.subtract_ranges(src1, src2) + self.assertEqual(dst, [(10, 11)]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/profiler/statistic_helper.py b/python/paddle/profiler/statistic_helper.py new file mode 100644 index 0000000000000..1f11649928a7f --- /dev/null +++ b/python/paddle/profiler/statistic_helper.py @@ -0,0 +1,225 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
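The semantics these tests pin down are plain interval algebra over (start, end) pairs; a few representative calls (a sketch using the module as the tests import it):

    import paddle.profiler.statistic_helper as sh

    sh.sum_ranges([(1, 3), (4, 10)])              # 8, total covered time
    sh.merge_ranges([(1, 5), (9, 12)], [(4, 7)])  # [(1, 7), (9, 12)]
    sh.intersection_ranges([(1, 7)], [(3, 8)])    # [(3, 7)]
    sh.subtract_ranges([(1, 10)], [(3, 7)])       # [(1, 3), (7, 10)]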
+import collections + + +def sum_ranges(ranges): + result = 0 + for time_range in ranges: + result += (time_range[1] - time_range[0]) + return result + + +def merge_self_ranges(src_ranges, is_sorted=False): + merged_ranges = [] + if len(src_ranges) > 0: + if not is_sorted: + src_ranges.sort(key=lambda x: x[0]) + cur_indx = 0 + merged_ranges.append((src_ranges[cur_indx][0], src_ranges[cur_indx][1])) + for cur_indx in range(1, len(src_ranges)): + if src_ranges[cur_indx][1] > merged_ranges[-1][1]: + if src_ranges[cur_indx][0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], + src_ranges[cur_indx][1]) + else: + merged_ranges.append( + (src_ranges[cur_indx][0], src_ranges[cur_indx][1])) + return merged_ranges + + +def merge_ranges(range_list1, range_list2, is_sorted=False): + merged_ranges = [] + if not is_sorted: + range_list1 = merge_self_ranges(range_list1) + range_list2 = merge_self_ranges(range_list2) + len1 = len(range_list1) + len2 = len(range_list2) + if len1 == 0 and len2 == 0: + return merged_ranges + elif len1 == 0: + return range_list2 + elif len2 == 0: + return range_list1 + else: + indx1 = 0 + indx2 = 0 + range1 = range_list1[indx1] + range2 = range_list2[indx2] + if range1[0] < range2[0]: + merged_ranges.append(range1) + indx1 += 1 + else: + merged_ranges.append(range2) + indx2 += 1 + while indx1 < len1 and indx2 < len2: + range1 = range_list1[indx1] + range2 = range_list2[indx2] + if range1[0] < range2[0]: + if range1[1] > merged_ranges[-1][1]: + if range1[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range1[1]) + else: + merged_ranges.append((range1[0], range1[1])) + indx1 += 1 + else: + indx1 += 1 + else: + if range2[1] > merged_ranges[-1][1]: + if range2[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) + else: + merged_ranges.append((range2[0], range2[1])) + indx2 += 1 + else: + indx2 += 1 + + while indx1 < len1: + range1 = range_list1[indx1] + if range1[1] > merged_ranges[-1][1]: + if range1[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range1[1]) + else: + merged_ranges.append((range1[0], range1[1])) + indx1 += 1 + else: + indx1 += 1 + while indx2 < len2: + if range2[1] > merged_ranges[-1][1]: + if range2[0] <= merged_ranges[-1][1]: + merged_ranges[-1] = (merged_ranges[-1][0], range2[1]) + else: + merged_ranges.append((range2[0], range2[1])) + indx2 += 1 + else: + indx2 += 1 + return merged_ranges + + +def intersection_ranges(range_list1, range_list2, is_sorted=False): + result_range = [] + if len(range_list1) == 0 or len(range_list2) == 0: + return result_range + if not is_sorted: + range_list1 = merge_self_ranges(range_list1) + range_list2 = merge_self_ranges(range_list2) + + len1 = len(range_list1) + len2 = len(range_list2) + indx1 = 0 + indx2 = 0 + range1 = range_list1[indx1] + range2 = range_list2[indx2] + while indx1 < len1 and indx2 < len2: + if range2[1] <= range1[0]: + indx2 += 1 + if indx2 == len2: + break + range2 = range_list2[indx2] + + elif range2[0] <= range1[0] and range2[1] < range1[1]: + assert (range2[1] > range1[0]) + result_range.append((range1[0], range2[1])) + range1 = (range2[1], range1[1]) + indx2 += 1 + if indx2 == len2: + break + range2 = range_list2[indx2] + + elif range2[0] <= range1[0]: + assert (range2[1] >= range1[1]) + result_range.append(range1) + range2 = (range1[1], range2[1]) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + + elif range2[1] < range1[1]: + assert (range2[0] > range1[0]) + 
result_range.append(range2) + range1 = (range2[1], range1[1]) + indx2 += 1 + if indx2 == len2: + break + range2 = range_list2[indx2] + + elif range2[0] < range1[1]: + assert (range2[1] >= range1[1]) + result_range.append((range2[0], range1[1])) + range2 = (range1[1], range2[1]) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + + else: + assert (range2[0] >= range1[1]) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + return result_range + + +def subtract_ranges(range_list1, range_list2, is_sorted=False): + result_range = [] + if not is_sorted: + range_list1 = merge_self_ranges(range_list1) + range_list2 = merge_self_ranges(range_list2) + if len(range_list1) == 0: + return result_range + if len(range_list2) == 0: + return range_list1 + + len1 = len(range_list1) + len2 = len(range_list2) + indx1 = 0 + indx2 = 0 + range1 = range_list1[indx1] + range2 = range_list2[indx2] + + while indx1 < len(range_list1): + if indx2 == len(range_list2): + result_range.append(range1) + indx1 += 1 + if indx1 == len1: + break + range1 = range_list1[indx1] + elif range2[1] <= range1[0]: + indx2 += 1 + if indx2 != len2: + range2 = range_list2[indx2] + elif range2[0] <= range1[0] and range2[1] < range1[1]: + range1 = (range2[1], range1[1]) + indx2 += 1 + if indx2 != len2: + range2 = range_list2[indx2] + elif range2[0] <= range1[0]: + assert (range2[1] >= range1[1]) + range2 = (range1[1], range2[1]) + indx1 += 1 + if indx1 != len1: + range1 = range_list1[indx1] + elif range2[0] < range1[1]: + assert (range2[0] > range1[0]) + result_range.append((range1[0], range2[0])) + range1 = (range2[0], range1[1]) + else: + assert (range2[0] >= range1[1]) + result_range.append(range1) + indx1 += 1 + if indx1 != len1: + range1 = range_list1[indx1] + return result_range From fe1cc8bd43472f6b9eb413a6ae88144517b9bf8a Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:01:54 +0800 Subject: [PATCH 11/50] [phi] move sigmoid_cross_entopy_with_logits log_loss cumsum auc infershape to phi (#40200) * move infershapes to phi * update code format * update code format --- paddle/fluid/operators/cumsum_op.cc | 20 ++--- paddle/fluid/operators/log_loss_op.cc | 45 ++-------- paddle/fluid/operators/metrics/auc_op.cc | 72 ++-------------- .../sigmoid_cross_entropy_with_logits_op.cc | 50 ++--------- paddle/phi/infermeta/binary.cc | 83 +++++++++++++++++++ paddle/phi/infermeta/binary.h | 14 ++++ paddle/phi/infermeta/multiary.cc | 80 ++++++++++++++++++ paddle/phi/infermeta/multiary.h | 12 +++ paddle/phi/infermeta/unary.cc | 18 ++++ paddle/phi/infermeta/unary.h | 7 ++ 10 files changed, 244 insertions(+), 157 deletions(-) diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 7c80917a71369..11633fb0b8703 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
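For the cumsum half of this patch, the shape rule being moved into phi (flatten=True collapses the input to 1-D before accumulating, otherwise the output keeps the input shape) is observable from Python, where axis=None maps to the flatten attribute (a sketch):

    import paddle

    x = paddle.ones([2, 3])
    print(paddle.cumsum(x, axis=1).shape)   # [2, 3], same shape as input
    print(paddle.cumsum(x).shape)           # [6], flattened first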
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,10 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - +DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 2e596ff3e6257..883e3597d8a31 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,7 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + 
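// shape inference is delegated to phi::LogLossInferMeta (added below in paddle/phi/infermeta/binary.cc) +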
PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 54ecba08a82dc..f3ed98c3f4d1e 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. 
The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,4 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 8e502fc04dbdb..016ff54645b02 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,46 +29,6 @@ const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." 
- "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -201,12 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 55230aa8d0516..b17405990fb72 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -575,6 +575,48 @@ void GatherTreeMeta(const MetaTensor& ids, out->set_dims(ids_dims); } +void LogLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float epsilon, + MetaTensor* out, + MetaConfig config) { + auto pred_dims = input.dims(); + auto label_dims = label.dims(); + + if (config.is_runtime || + (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { + PADDLE_ENFORCE_EQ( + pred_dims, + label_dims, + phi::errors::InvalidArgument( + "The dimensions of Input(Predicted) must be equal to the" + "dimensions of Input(Labels), but received dimensions of " + "Input(Predicted)" + "is [%s], received dimensions of Input(Labels) is [%s].", + pred_dims, + label_dims)); + } + PADDLE_ENFORCE_EQ(pred_dims.size(), + 2, + phi::errors::InvalidArgument( + "The dimensions of Input(Predicted) must be 2," + "But received dimensions of Input(Predicted)" + "is [%d]", + pred_dims.size())); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(pred_dims[1], + 1, + phi::errors::InvalidArgument( + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(X) must be 1," + "But got [%d]", + pred_dims[1])); + } + out->set_dims({pred_dims[0], 1}); + out->set_dtype(input.dtype()); + out->share_lod(input); +} + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { auto dim_x = x.dims(); auto dim_vec = vec.dims(); @@ -605,4 +647,45 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { out->share_lod(x); } +void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, + const MetaTensor& label, + bool normalize, + int ignore_index, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto labels_dims = label.dims(); + int rank 
= x_dims.size(); + PADDLE_ENFORCE_EQ(rank, + labels_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." + "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + labels_dims.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ( + phi::slice_ddim(x_dims, 0, rank), + phi::slice_ddim(labels_dims, 0, rank), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension. But received: the shape of " + "Input(X) is [%s], the shape of Input(Label) is [%s].", + x_dims, + labels_dims)); + } + + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + out->share_lod(x); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 106c22f7548bd..934ed688bf2df 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -89,6 +89,7 @@ void BincountInferMeta(const MetaTensor& x, const paddle::optional weights, int minlength, MetaTensor* out); + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -102,6 +103,19 @@ void GatherTreeMeta(const MetaTensor& ids, const MetaTensor& parents, MetaTensor* out); +void LogLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float epsilon, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out); +void SigmoidCrossEntropyWithLogitsInferMeta(const MetaTensor& x, + const MetaTensor& label, + bool normalize, + int ignore_index, + MetaTensor* out, + MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index a21f077c09f09..acce40713b821 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -28,6 +28,86 @@ std::vector GetMetaTensorsDim(const std::vector& tensors) { return dims; } +void AucInferMeta(const MetaTensor& input, + const MetaTensor& label, + const MetaTensor& stat_pos, + const MetaTensor& stat_neg, + const std::string& curve, + int num_thresholds, + int slide_steps, + MetaTensor* auc, + MetaTensor* stat_pos_out, + MetaTensor* stat_neg_out, + MetaConfig config) { + auto predict_dims = input.dims(); + auto label_dims = label.dims(); + PADDLE_ENFORCE_GE( + predict_dims.size(), + 2, + phi::errors::InvalidArgument( + "The Input(Predict) has not been initialized properly. The " + "shape of Input(Predict) = [%s], the shape size must be " + "greater_equal 2.", + predict_dims)); + auto predict_width = predict_dims[1]; + PADDLE_ENFORCE_NE( + phi::product(predict_dims), + 0, + phi::errors::InvalidArgument( + "The Input(Predict) has not been initialized properly. The " + "shape of Input(Predict) = [%s], the shape can not involes 0.", + predict_dims)); + PADDLE_ENFORCE_NE( + phi::product(label_dims), + 0, + phi::errors::InvalidArgument( + "The Input(Label) has not been initialized properly. 
The " + "shape of Input(Label) = [%s], the shape can not involes 0.", + label_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_LE( + predict_width, + 2, + phi::errors::InvalidArgument("Only support binary classification," + "prediction dims[1] should be 1 or 2")); + } + auto predict_height = input.dims()[0]; + auto label_height = label.dims()[0]; + + if (config.is_runtime) { + PADDLE_ENFORCE_EQ( + predict_height, + label_height, + phi::errors::InvalidArgument("Out and Label should have same height.")); + } + + int num_pred_buckets = num_thresholds + 1; + + PADDLE_ENFORCE_GE( + num_pred_buckets, + 1, + phi::errors::InvalidArgument("num_thresholds must larger than 1")); + PADDLE_ENFORCE_GE( + slide_steps, + 0, + phi::errors::InvalidArgument("slide_steps must be natural number")); + + auc->set_dims({1}); + auc->set_dtype(DataType::INT64); + + if (slide_steps) { + stat_pos_out->set_dims({(1 + slide_steps) * num_pred_buckets + 1}); + stat_pos_out->set_dtype(DataType::INT64); + stat_neg_out->set_dims({(1 + slide_steps) * num_pred_buckets + 1}); + stat_neg_out->set_dtype(DataType::INT64); + } else { + stat_pos_out->set_dims({1, num_pred_buckets}); + stat_pos_out->set_dtype(DataType::INT64); + stat_neg_out->set_dims({1, num_pred_buckets}); + stat_neg_out->set_dtype(DataType::INT64); + } +} + void AdamaxInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& learning_rate, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 8cb6f70481de3..26bdc62302f18 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -20,6 +20,18 @@ namespace phi { std::vector GetMetaTensorsDim(const std::vector& tensors); +void AucInferMeta(const MetaTensor& input, + const MetaTensor& label, + const MetaTensor& stat_pos, + const MetaTensor& stat_neg, + const std::string& curve, + int num_thresholds, + int slide_steps, + MetaTensor* auc, + MetaTensor* stat_pos_out, + MetaTensor* stat_neg_out, + MetaConfig config = MetaConfig()); + void BilinearTensorProductInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& weight, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index b9eb5196b1e8f..4053cfbc362e3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -156,6 +156,24 @@ void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_layout(x.layout()); } +void CumsumInferMeta(const MetaTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + MetaTensor* out) { + auto x_dims = x.dims(); + if (flatten) { + out->set_dims(phi::make_ddim({phi::product(x_dims)})); + out->set_dtype(x.dtype()); + } else { + out->set_dims(x_dims); + out->set_dtype(x.dtype()); + } + + out->share_lod(x); +} + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { PADDLE_ENFORCE_EQ( product(x.dims()), diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 37b17f6e3d182..a679ef8c11af6 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -63,6 +63,13 @@ void CopyToInferMeta(const MetaTensor& x, void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); +void CumsumInferMeta(const MetaTensor& x, + int axis, + bool flatten, + bool exclusive, + bool reverse, + MetaTensor* out); + void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); void InferMetaFromVecValue(const MetaTensor& x, From 0c33c47ee752befb54b6a16f6608cb3c411506d9 Mon Sep 17 00:00:00 
2001 From: ronnywang <524019753@qq.com> Date: Tue, 8 Mar 2022 10:21:48 +0800 Subject: [PATCH 12/50] fix paddle.median torch diff (#40118) --- python/paddle/tensor/stat.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 468aa46048627..dd0da03e4fd28 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -321,6 +321,9 @@ def median(x, axis=None, keepdim=False, name=None): paddle.slice( tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]), dtype=dtype) + out_tensor = out_tensor + paddle.sum( + paddle.cast( + paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True) if not keepdim or is_flatten: if not is_flatten: newshape = x.shape[:axis] + x.shape[axis + 1:] From 81d4142b97e3758f7a526066dd0414ec8b306098 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:23:01 +0800 Subject: [PATCH 13/50] [Phi] move InferShape for truncated_gaussian_random and gaussian_random (#40191) * [Phi] move InferShape for truncated_gaussian_random and gaussian_random * [Phi] delete useless codes --- paddle/fluid/operators/gaussian_random_op.cc | 47 +++++-------------- .../operators/truncated_gaussian_random_op.cc | 36 +++++--------- paddle/phi/infermeta/nullary.cc | 25 ++++++++++ paddle/phi/infermeta/nullary.h | 14 ++++++ .../cpu/truncated_gaussian_random_kernel.cc | 2 +- .../gpu/truncated_gaussian_random_kernel.cu | 3 +- .../truncated_gaussian_random_kernel.h | 5 +- 7 files changed, 70 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 6b559885c569d..66eecc13d04d1 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,12 +15,14 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -54,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -171,11 +141,20 @@ Used to initialize tensors with gaussian random generator. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 6eb7f922dfdbe..dc5a66dce16d6 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -17,8 +17,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/truncated_gaussian_random_op.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of TruncatedGaussianRandomOp should not be null.")); - auto shape = ctx->Attrs().Get>("shape"); - std::vector out_dim; - out_dim.reserve(shape.size()); - for (auto dim : shape) { - out_dim.push_back(static_cast(dim)); - } - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "the input shape of TruncatedGaussianRandomOp must be set, " - "But the rank of shape we received is %d", - shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random, - ops::TruncatedGaussianRandomOp, - ops::TruncatedGaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR( + truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor, + PD_INFER_META(phi::TruncatedGaussianRandomInferMeta)); + +REGISTER_OPERATOR( + truncated_gaussian_random, ops::TruncatedGaussianRandomOp, + ops::TruncatedGaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + TruncatedGaussianRandomInferShapeFunctor); diff --git a/paddle/phi/infermeta/nullary.cc b/paddle/phi/infermeta/nullary.cc index 0c48c9d0c7eae..506d3fd14ea3f 100644 --- a/paddle/phi/infermeta/nullary.cc +++ b/paddle/phi/infermeta/nullary.cc @@ -40,4 +40,29 @@ void EyeInferMeta(int64_t num_rows, out->set_dims({num_rows, num_columns}); out->set_dtype(dtype); } + +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + +void GaussianRandomInferMeta(const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out) { + auto out_dims = phi::make_ddim(shape.GetData()); + out->set_dims(out_dims); + out->set_dtype(dtype); + out->set_layout(DataLayout::NCHW); +} + } // namespace phi diff --git a/paddle/phi/infermeta/nullary.h b/paddle/phi/infermeta/nullary.h index 40d6ea595c0c9..bd0567486e4d6 100644 --- a/paddle/phi/infermeta/nullary.h +++ b/paddle/phi/infermeta/nullary.h @@ -40,4 +40,18 @@ void EyeInferMeta(int64_t num_rows, DataType dtype, MetaTensor* out); +void TruncatedGaussianRandomInferMeta(const std::vector& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); + +void GaussianRandomInferMeta(const ScalarArray& shape, + float mean, + float std, + int seed, + DataType dtype, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc index 
ebc032ef54538..4247e597acef4 100644 --- a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc +++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc @@ -27,7 +27,7 @@ namespace phi { template void TruncatedGaussianRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const std::vector& shape, float mean, float std, int seed, diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu index 12c1bf791e169..f27b32ca7b831 100644 --- a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu +++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu @@ -25,7 +25,6 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/fluid/framework/generator.h" -// #include "paddle/phi/core/generator.h" namespace phi { @@ -87,7 +86,7 @@ struct TruncatedNormalOffset { template void TruncatedGaussianRandomKernel(const Context& dev_ctx, - const ScalarArray& shape, + const std::vector& shape, float mean, float std, int seed, diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h index 0370cc431fef9..f8547ced41934 100644 --- a/paddle/phi/kernels/truncated_gaussian_random_kernel.h +++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h @@ -20,6 +20,7 @@ #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/device_context.h" +#include "paddle/phi/infermeta/nullary.h" namespace phi { @@ -157,8 +158,8 @@ struct TruncatedNormal { }; template -void TruncatedGaussianRandomKernel(const Context& ctx, - const ScalarArray& shape, +void TruncatedGaussianRandomKernel(const Context& dev_ctx, + const std::vector& shape, float mean, float std, int seed, From 413a743e7f5e0436db60b7d1718cc0353488062a Mon Sep 17 00:00:00 2001 From: tanzhipeng <51696454+tiancaitzp@users.noreply.github.com> Date: Tue, 8 Mar 2022 10:29:32 +0800 Subject: [PATCH 14/50] remove unnecessary constant fill in sequence conv test=kunlun. 
(#40126) --- .../fluid/operators/sequence_ops/sequence_conv_op_xpu.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 6c33ff52044b2..23c6a0133e1ed 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, From 6bd2d2b1cb5fa2e350adb4c9b291b48054257be5 Mon Sep 17 00:00:00 2001 From: wawltor Date: Tue, 8 Mar 2022 10:29:59 +0800 Subject: [PATCH 15/50] [Phi] move the graph_send_recv op to the phi (#40092) * [Phi] transfer old kernel to pten kernel for the graph_send_recv op * update the code for the define of graph_send_recv * fix the gradient problem for graph_send_recv * fix the compile problem * update the enfore message for the windows * update the code for the compiler * update compiler problem for the windows * udpate the code for windows * fix some format problem --- paddle/fluid/operators/graph_send_recv_op.cc | 12 +- paddle/fluid/operators/graph_send_recv_op.cu | 419 ------------------ paddle/fluid/operators/graph_send_recv_op.h | 291 ------------ .../phi/kernels/cpu/graph_send_recv_funcs.h | 80 ++++ .../cpu/graph_send_recv_grad_kernel.cc | 172 +++++++ .../phi/kernels/cpu/graph_send_recv_kernel.cc | 153 +++++++ .../phi/kernels/gpu/graph_send_recv_funcs.h | 171 +++++++ .../gpu/graph_send_recv_grad_kernel.cu | 148 +++++++ .../phi/kernels/gpu/graph_send_recv_kernel.cu | 179 ++++++++ .../phi/kernels/graph_send_recv_grad_kernel.h | 33 ++ paddle/phi/kernels/graph_send_recv_kernel.h | 31 ++ paddle/phi/ops/compat/graph_send_recv_sig.cc | 31 ++ 12 files changed, 999 insertions(+), 721 deletions(-) delete mode 100644 paddle/fluid/operators/graph_send_recv_op.cu delete mode 100644 paddle/fluid/operators/graph_send_recv_op.h create mode 100644 paddle/phi/kernels/cpu/graph_send_recv_funcs.h create mode 100644 paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/graph_send_recv_kernel.cc create mode 100644 paddle/phi/kernels/gpu/graph_send_recv_funcs.h create mode 100644 
paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/graph_send_recv_kernel.cu create mode 100644 paddle/phi/kernels/graph_send_recv_grad_kernel.h create mode 100644 paddle/phi/kernels/graph_send_recv_kernel.h create mode 100644 paddle/phi/ops/compat/graph_send_recv_sig.cc diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eba4..b759345eda565 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -171,13 +171,3 @@ REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvGradOpMaker, ops::GraphSendRecvGradOpMaker); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac38..0000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { - *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void 
ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? 
grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee845..0000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - 
elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if (pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = 
ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/graph_send_recv_funcs.h b/paddle/phi/kernels/cpu/graph_send_recv_funcs.h new file mode 100644 index 0000000000000..df6d9c87be0ed --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_recv_funcs.h @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +struct GraphSendRecvSumFunctor { + void operator()(const bool& first_flag, + const DenseTensor& src_slice, + DenseTensor* dst_slice) { + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(*dst_slice); + eigen_dst += eigen_src; + } +}; + +template +struct GraphSendRecvMinFunctor { + void operator()(const bool& first_flag, + const DenseTensor& src_slice, + DenseTensor* dst_slice) { + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(*dst_slice); + if (first_flag) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMin(eigen_src); + } + } +}; + +template +struct GraphSendRecvMaxFunctor { + void operator()(const int& first_flag, + const DenseTensor& src_slice, + DenseTensor* dst_slice) { + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(*dst_slice); + if (first_flag) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMax(eigen_src); + } + } +}; + +template +void ElementwiseInnerOperation(const DenseTensor& src, + DenseTensor* dst, + const IndexT& src_index, + const IndexT& dst_index, + const bool& first_flag, + Functor functor) { + auto src_slice = src.Slice(src_index, src_index + 1); + auto dst_slice = dst->Slice(dst_index, dst_index + 1); + + functor(first_flag, src_slice, &dst_slice); +} + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc new file mode 100644 index 0000000000000..8538461b1b83b --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
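The three functors above share a first_flag contract: the destination buffer is zero-filled before the loop, so the first write into a slice must behave as a plain copy (done via += against the zero placeholder) and only later writes may apply cwiseMin/cwiseMax. A minimal scalar sketch of that contract, assuming a zero-initialized dst and a hypothetical seen mask (neither helper below is part of this patch):

#include <algorithm>
#include <vector>

void ScalarSendRecvMin(const std::vector<float>& src,
                       const std::vector<int>& src_index,
                       const std::vector<int>& dst_index,
                       std::vector<float>* dst,    // zero-initialized
                       std::vector<bool>* seen) {  // all false initially
  for (size_t i = 0; i < src_index.size(); ++i) {
    const float v = src[src_index[i]];
    const int d = dst_index[i];
    if (!(*seen)[d]) {
      (*dst)[d] += v;  // first visit: += against 0 acts as a copy
      (*seen)[d] = true;
    } else {
      (*dst)[d] = std::min((*dst)[d], v);  // later visits: running minimum
    }
  }
}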
+ +#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" + +#include +#include + +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvCpuGradLoop(const int& input_size, + const int& index_size, + const IndexT* s_index, + const IndexT* d_index, + const DenseTensor& src, + DenseTensor* dst, + const std::string& pool_type, + const int* dst_count = nullptr, + const DenseTensor* input = nullptr, + const DenseTensor* output = nullptr) { + if (pool_type == "SUM") { + Functor functor; + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + auto src_slice = src.Slice(src_idx, src_idx + 1); + auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(dst_slice); + eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); + } + } else if (pool_type == "MIN" || pool_type == "MAX") { + for (int i = 0; i < index_size; ++i) { + const IndexT& forward_src_idx = d_index[i]; + const IndexT& forward_dst_idx = s_index[i]; + auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); + auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); + auto eigen_input = phi::EigenVector::Flatten(input_slice); + auto eigen_output = phi::EigenVector::Flatten(output_slice); + + auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); + auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); + auto eigen_src = phi::EigenVector::Flatten(src_slice); + auto eigen_dst = phi::EigenVector::Flatten(dst_slice); + eigen_dst += eigen_src * (eigen_output == eigen_input); + } + } +} + +template +void GraphSendRecvGradOpKernelLaunchHelper( + const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* x_grad, + const DenseTensor* dst_count = nullptr, + const DenseTensor* x = nullptr, + const DenseTensor* out = nullptr) { + const int& index_size = dst_index.dims()[0]; + + ctx.template Alloc(x_grad); + T* p_output = x_grad->data(); + const auto& src_dims = out_grad.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + if (index_size == 0) return; + + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + + if (pool_type == "SUM") { + GraphSendRecvCpuGradLoop>( + src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type); + } else if (pool_type == "MEAN") { + const int* s_count = dst_count->data(); + // Functor not used here. + GraphSendRecvCpuGradLoop>(src_dims[0], + index_size, + d_index, + s_index, + out_grad, + x_grad, + pool_type, + s_count); + } else if (pool_type == "MIN" || pool_type == "MAX") { + // Functor not used here. 
+ GraphSendRecvCpuGradLoop>(src_dims[0], + index_size, + d_index, + s_index, + out_grad, + x_grad, + pool_type, + nullptr, + x, + out); + } +} + +template +void GraphSendRecvGradKernel(const Context& ctx, + const DenseTensor& out_grad, + paddle::optional x, + paddle::optional out, + const DenseTensor& src_index, + const DenseTensor& dst_index, + paddle::optional dst_count, + const std::string& pool_type, + DenseTensor* x_grad) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendRecvGradOpKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvGradOpKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv_grad, + CPU, + ALL_LAYOUT, + phi::GraphSendRecvGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc new file mode 100644 index 0000000000000..fecbd4b1d7aa0 --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
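The grad loop above encodes three backward rules; restated per scalar element (illustrative helper only, not code from this patch): SUM passes the incoming gradient through unchanged, MEAN scales it by 1/dst_count, and MIN/MAX forwards it only where the forward input equals the pooled forward output.

#include <string>

float GradContribution(const std::string& pool_type, float grad_out,
                       int dst_count, float fwd_in, float fwd_out) {
  if (pool_type == "SUM") return grad_out;
  if (pool_type == "MEAN") return grad_out / static_cast<float>(dst_count);
  // MIN/MAX: the equality test is the 0/1 indicator behind
  // eigen_dst += eigen_src * (eigen_output == eigen_input);
  return grad_out * static_cast<float>(fwd_in == fwd_out);
}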
+ +#include "paddle/phi/kernels/graph_send_recv_kernel.h" +#include "paddle/phi/kernels/cpu/graph_send_recv_funcs.h" + +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvCpuLoop(const int& input_size, + const int& index_size, + const IndexT* s_index, + const IndexT* d_index, + const DenseTensor& src, + DenseTensor* dst, + const std::string& pool_type, + int* dst_count = nullptr) { + Functor functor; + if (pool_type == "SUM") { + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + for (int i = 0; i < index_size; ++i) { + IndexT dst_idx = d_index[i]; + *(dst_count + dst_idx) += 1; + } + for (int i = 0; i < input_size; ++i) { + if (*(dst_count + i) == 0) continue; + auto dst_slice = dst->Slice(i, i + 1); + auto eigen_dst = phi::EigenVector::Flatten(dst_slice); + eigen_dst = eigen_dst / static_cast(*(dst_count + i)); + } + } else if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, true, functor); + existed_dst.emplace(dst_idx); + } else { + ElementwiseInnerOperation( + src, dst, src_idx, dst_idx, false, functor); + } + } + } +} + +template +void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count = nullptr) { + const int& index_size = src_index.dims()[0]; + + ctx.template Alloc(out); + T* p_output = out->data(); + const auto& src_dims = x.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + if (index_size == 0) return; + + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + if (pool_type == "SUM") { + GraphSendRecvCpuLoop>( + src_dims[0], index_size, s_index, d_index, x, out, pool_type); + } else if (pool_type == "MIN") { + GraphSendRecvCpuLoop>( + src_dims[0], index_size, s_index, d_index, x, out, pool_type); + } else if (pool_type == "MAX") { + GraphSendRecvCpuLoop>( + src_dims[0], index_size, s_index, d_index, x, out, pool_type); + } else if (pool_type == "MEAN") { + ctx.template Alloc(dst_count); + int* p_dst_count = dst_count->data(); + memset(p_dst_count, 0, src_dims[0] * sizeof(int)); + GraphSendRecvCpuLoop>(src_dims[0], + index_size, + s_index, + d_index, + x, + out, + pool_type, + p_dst_count); + } +} + +template +void GraphSendRecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + 
GraphSendRecvOpKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvOpKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv, + CPU, + ALL_LAYOUT, + phi::GraphSendRecvKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h new file mode 100644 index 0000000000000..1eab521170bc5 --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_recv_funcs.h @@ -0,0 +1,171 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/phi/kernels/graph_send_recv_kernel.h" + +#include +#include +#include +#include + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" + +namespace phi { + +template +struct GraphSendRecvSumCUDAFunctor { + DEVICE inline void operator()(const T* params, + T* output, + const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); + } +}; + +template +struct GraphSendRecvMaxCUDAFunctor { + DEVICE inline void operator()(const T* params, + T* output, + const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); + } +}; + +template +struct GraphSendRecvMinCUDAFunctor { + DEVICE inline void operator()(const T* params, + T* output, + const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); + } +}; + +template +__global__ void GraphSendRecvCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + size_t index_size, + size_t slice_size, + Functor functor) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; + functor(params, output, in_i, out_i); + } +} + +// For max +template +__global__ void InputResetMaxCUDAKernel(T* output, + size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::min()) { + *(output + i) = 0; + } + } +} + +// For min +template +__global__ void InputResetMinCUDAKernel(T* output, + size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::max()) { + *(output + i) = 0; + } + } +} + +// Get dst_count +template +__global__ void ComputeCountCUDAKernel(int32_t* count, + const IndexT* dst_indices, + size_t index_size) { + 
CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { + IndexT dst_i = dst_indices[i]; + paddle::platform::CudaAtomicAdd(count + dst_i, 1); + } +} + +// For forward mean +template +__global__ void ManipulateMeanCUDAKernel(T* output, + int32_t* count, + size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + int64_t c_index = i / slice_size; + if (*(count + c_index) > 1) { + *(output + i) = *(output + i) / *(count + c_index); + } + } +} + +// For backward mean +template +__global__ void ManipulateMeanGradCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + size_t index_size, + size_t slice_size, + const int32_t* dst_count) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd(output + out_i, + *(params + in_i) / dst_count[src_i]); + } +} + +// For backward min and max +template +__global__ void ManipulateMinMaxGradCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + size_t index_size, + size_t slice_size, + const T* ptr_input, + const T* ptr_output) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd( + output + out_i, + *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu new file mode 100644 index 0000000000000..75692966b4662 --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu @@ -0,0 +1,148 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
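Every CUDA kernel above walks a flattened [index_size, slice_size] iteration space and splits the thread id back into an (edge, feature) pair before forming the gather and scatter offsets. A host-side sketch of that index arithmetic (hypothetical helper, assuming int64_t indices):

#include <cstdint>

struct GatherScatterOffsets {
  int64_t in_i;   // element read from params
  int64_t out_i;  // element atomically combined into output
};

GatherScatterOffsets Decompose(int64_t i, int64_t slice_size,
                               const int64_t* src_indices,
                               const int64_t* dst_indices) {
  const int64_t indices_i = i / slice_size;            // which index pair
  const int64_t slice_i = i - indices_i * slice_size;  // offset inside a row
  return {src_indices[indices_i] * slice_size + slice_i,
          dst_indices[indices_i] * slice_size + slice_i};
}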
+ +#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/graph_send_recv_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvGradOpCUDAKernelLaunchHelper( + const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* x_grad, + const DenseTensor* dst_count = nullptr, + const DenseTensor* x = nullptr, + const DenseTensor* out = nullptr) { + const int& index_size = dst_index.dims()[0]; + + ctx.template Alloc(x_grad); + T* p_output = x_grad->data(); + + const auto& src_dims = out_grad.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else + cudaMemset(p_output, 0, memset_bytes); +#endif + + if (index_size == 0) return; + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = out_grad.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif + int64_t n = slice_size * index_size; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvSumCUDAFunctor><<>>( + p_src, d_index, s_index, p_output, index_size, slice_size, functor); + } else if (pool_type == "MEAN") { + const int32_t* s_count = dst_count->data(); + ManipulateMeanGradCUDAKernel<<>>( + p_src, d_index, s_index, p_output, index_size, slice_size, s_count); + } else if (pool_type == "MAX" || pool_type == "MIN") { + const T* ptr_input = x->data(); + const T* ptr_output = out->data(); + ManipulateMinMaxGradCUDAKernel<<>>( + p_src, + d_index, + s_index, + p_output, + index_size, + slice_size, + ptr_input, + ptr_output); + } +} + +template +void GraphSendRecvGradKernel(const Context& ctx, + const DenseTensor& out_grad, + paddle::optional x, + paddle::optional out, + const DenseTensor& src_index, + const DenseTensor& dst_index, + paddle::optional dst_count, + const std::string& pool_type, + DenseTensor* x_grad) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendRecvGradOpCUDAKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvGradOpCUDAKernelLaunchHelper( + ctx, + out_grad, + src_index, + dst_index, + pool_type, + x_grad, + dst_count.get_ptr(), + x.get_ptr(), + out.get_ptr()); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv_grad, + GPU, + ALL_LAYOUT, + phi::GraphSendRecvGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu new file mode 100644 index 0000000000000..fab306f831a6f --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -0,0 +1,179 @@ +// 
Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/graph_send_recv_kernel.h" + +#include +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count = nullptr) { + const int& index_size = src_index.dims()[0]; + ctx.template Alloc(out); + T* p_output = out->data(); + const auto& src_dims = x.dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else + cudaMemset(p_output, 0, memset_bytes); +#endif + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, + p_output_ptr, + p_output_ptr + memset_size, + std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, + p_output_ptr, + p_output_ptr + memset_size, + std::numeric_limits::max()); + } + + if (index_size == 0) return; + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = x.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif + int64_t n = slice_size * index_size; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvSumCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + } else if (pool_type == "MAX") { + GraphSendRecvMaxCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvMaxCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + + int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_max = + grid_max_tmp < max_grid_dimx ? 
grid_max_tmp : max_grid_dimx; + InputResetMaxCUDAKernel<<>>( + p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + GraphSendRecvMinCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvMinCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + + int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_min = + grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; + InputResetMinCUDAKernel<<>>( + p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel< + T, + IndexT, + GraphSendRecvSumCUDAFunctor><<>>( + p_src, s_index, d_index, p_output, index_size, slice_size, functor); + + ctx.template Alloc(dst_count); + int32_t* p_dst_count = dst_count->data(); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_dst_count, 0, input_size * sizeof(int)); +#else + cudaMemset(p_dst_count, 0, input_size * sizeof(int)); +#endif + + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel<<>>( + p_dst_count, d_index, index_size); + + int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_mean = + grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx; + ManipulateMeanCUDAKernel<<>>( + p_output, p_dst_count, input_size, slice_size); + } +} + +template +void GraphSendRecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendRecvOpCUDAKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } else if (index_type == phi::DataType::INT64) { + GraphSendRecvOpCUDAKernelLaunchHelper( + ctx, x, src_index, dst_index, pool_type, out, dst_count); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_recv, + GPU, + ALL_LAYOUT, + phi::GraphSendRecvKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h new file mode 100644 index 0000000000000..d163e6e278a07 --- /dev/null +++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
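Two details of the forward CUDA path above are worth spelling out. First, MAX/MIN outputs are pre-filled with numeric_limits sentinels so the atomics can reduce unconditionally; rows that receive no message keep the sentinel, which is why InputResetMaxCUDAKernel and InputResetMinCUDAKernel rewrite untouched elements to 0 afterwards. Second, the launch size is clamped to the device's gridDim.x limit, which is only safe because CUDA_KERNEL_LOOP_TYPE is a grid-stride loop; a sketch of that clamp (the helper name is ours, not Paddle's):

#include <algorithm>
#include <cstdint>

int64_t CappedGrid(int64_t n, int block, int64_t max_grid_dimx) {
  const int64_t grid_tmp = (n + block - 1) / block;  // ceil(n / block)
  // Clamped launches still cover all of [0, n): each surviving thread
  // strides by gridDim.x * blockDim.x until the range is exhausted.
  return std::min(grid_tmp, max_grid_dimx);
}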
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void GraphSendRecvGradKernel(const Context& ctx, + const DenseTensor& out_grad, + paddle::optional x, + paddle::optional out, + const DenseTensor& src_index, + const DenseTensor& dst_index, + paddle::optional dst_count, + const std::string& pool_type, + DenseTensor* x_grad); +} // namespace phi diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h new file mode 100644 index 0000000000000..95dbdc4443ad0 --- /dev/null +++ b/paddle/phi/kernels/graph_send_recv_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GraphSendRecvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& pool_type, + DenseTensor* out, + DenseTensor* dst_count); + +} // namespace phi diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc new file mode 100644 index 0000000000000..dacb8b25a89f9 --- /dev/null +++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GraphSendRecvGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "graph_send_recv_grad", + {GradVarName("Out"), "X", "Out", "Src_index", "Dst_index", "Dst_count"}, + {"pool_type"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad, + phi::GraphSendRecvGradOpArgumentMapping); From c722ee690dd75389bf000cd5435f5f4519c4b7a2 Mon Sep 17 00:00:00 2001 From: maxhuiy <1508399706@qq.com> Date: Tue, 8 Mar 2022 10:35:27 +0800 Subject: [PATCH 16/50] [MLU] add fleet init api and collective api pytest for mlu (#40010) * [MLU] add fleet init api and collective api pytest for mlu * fix no value for argument 'data_type' in method call --- python/paddle/distributed/collective.py | 4 + python/paddle/distributed/parallel.py | 14 +- python/paddle/fluid/dygraph/parallel.py | 3 + .../fluid/tests/unittests/mlu/CMakeLists.txt | 12 +- .../tests/unittests/mlu/c_comm_init_op_mlu.py | 71 ++++++ .../unittests/mlu/collective_allreduce_api.py | 54 +++++ .../unittests/mlu/collective_broadcast_api.py | 54 +++++ .../unittests/mlu/test_c_comm_init_op_mlu.sh | 21 ++ .../mlu/test_collective_allreduce_api_mlu.py | 43 ++++ .../mlu/test_collective_api_base_mlu.py | 223 ++++++++++++++++++ .../mlu/test_collective_broadcast_api_mlu.py | 43 ++++ 11 files changed, 535 insertions(+), 7 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 8042aced6bbdf..bf6556d21e9fc 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -267,6 +267,10 @@ def new_group(ranks=None, backend=None): place = core.NPUPlace(genv.device_id) core.HCCLParallelContext(strategy, place).init_with_ring_id(ring_id) + elif core.is_compiled_with_mlu(): + place = core.MLUPlace(genv.device_id) + core.CNCLParallelContext(strategy, + place).init_with_ring_id(ring_id) else: assert False, ("no cuda device found") else: diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 177e19194a522..16ed528b64f0c 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -58,9 +58,9 @@ def _start_kv_server(port, http_server_d, size): def _is_cpuonly(backend): check_backend(backend) - if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter'] and ( + if backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] and ( core.is_compiled_with_cuda() or core.is_compiled_with_xpu() or - core.is_compiled_with_npu()): + core.is_compiled_with_npu() or core.is_compiled_with_mlu()): # passes 'auto' and can use cuda or xpu, use the default logics. so return False return False @@ -152,7 +152,8 @@ def train(): is_cpu_only = _is_cpuonly(backend) # 1. 
gpu xpu check, must be gpu or xpu, if not (is_cpu_only or core.is_compiled_with_cuda() or - core.is_compiled_with_xpu() or core.is_compiled_with_npu()): + core.is_compiled_with_xpu() or core.is_compiled_with_npu() or + core.is_compiled_with_mlu()): raise NotImplementedError( "If you want to use CPU-only version, please use 'gloo' as backend") @@ -162,6 +163,8 @@ def train(): _check_var_exists('FLAGS_selected_xpus') elif not is_cpu_only and core.is_compiled_with_npu(): _check_var_exists('FLAGS_selected_npus') + elif not is_cpu_only and core.is_compiled_with_mlu(): + _check_var_exists('FLAGS_selected_mlus') _check_var_exists("PADDLE_TRAINER_ID") _check_var_exists("PADDLE_CURRENT_ENDPOINT") @@ -213,6 +216,8 @@ def train(): place = core.XPUPlace(parallel_env.device_id) elif core.is_compiled_with_npu(): place = core.NPUPlace(parallel_env.device_id) + elif core.is_compiled_with_mlu(): + place = core.MLUPlace(parallel_env.device_id) _set_expected_place(place) # init nccl or hccl or bkcl or heter context @@ -231,6 +236,9 @@ def train(): elif core.is_compiled_with_npu(): parallel_helper._set_parallel_ctx( core.HCCLParallelContext(strategy, place)) + elif core.is_compiled_with_mlu(): + parallel_helper._set_parallel_ctx( + core.CNCLParallelContext(strategy, place)) if backend != "heter": other_endpoints = strategy.trainer_endpoints[:] diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 0049f387b707f..652916491eed7 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -128,6 +128,9 @@ def __init__(self): elif core.is_compiled_with_npu(): selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",") self._device_id = int(selected_npus[0]) + elif core.is_compiled_with_mlu(): + selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",") + self._device_id = int(selected_mlus[0]) self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(",") diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index c17790bd3200e..17f5509bdb958 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -13,13 +13,17 @@ if (WITH_MLU) endforeach(TEST_OP) if(WITH_CNCL) - foreach(TEST_OP ${TEST_DIST_OPS}) + foreach(TEST_OP ${TEST_DIST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_c_comm_init_op_mlu START_BASH test_c_comm_init_op_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_broadcast_api_mlu PROPERTIES TIMEOUT 120) + 
set_tests_properties(test_collective_allreduce_api_mlu PROPERTIES TIMEOUT 120) + set_tests_properties(test_c_comm_init_op_mlu PROPERTIES TIMEOUT 120) endif(WITH_CNCL) endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py new file mode 100644 index 0000000000000..e91f28e3b1db8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/c_comm_init_op_mlu.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import os +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.distributed.fleet.base.private_helper_function import wait_server_ready +import paddle + +paddle.enable_static() + + +class TestCCommInitOp(unittest.TestCase): + def setUp(self): + self.endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',') + self.current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + self.nranks = len(self.endpoints) + self.rank = self.endpoints.index(self.current_endpoint) + self.mlu_id = int(os.getenv("FLAGS_selected_mlus")) + self.place = fluid.MLUPlace(self.mlu_id) + self.exe = fluid.Executor(self.place) + self.endpoints.remove(self.current_endpoint) + self.other_endpoints = self.endpoints + if self.rank == 0: + wait_server_ready(self.other_endpoints) + + def test_specifying_devices(self): + program = fluid.Program() + block = program.global_block() + cncl_id_var = block.create_var( + name=fluid.unique_name.generate('cncl_id'), + persistable=True, + type=fluid.core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_cncl_id', + inputs={}, + outputs={'Out': cncl_id_var}, + attrs={ + 'rank': self.rank, + 'endpoint': self.current_endpoint, + 'other_endpoints': self.other_endpoints + }) + block.append_op( + type='c_comm_init', + inputs={'X': cncl_id_var}, + outputs={}, + attrs={ + 'nranks': self.nranks, + 'rank': self.rank, + 'ring_id': 0, + 'device_id': self.mlu_id + }) + self.exe.run(program) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py new file mode 100644 index 0000000000000..ebe4e71d22fde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_api.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base_mlu import TestCollectiveAPIRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveAllreduceAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + paddle.distributed.all_reduce(tindata) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllreduceAPI, "allreduce") diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py new file mode 100644 index 0000000000000..2002909ea2eec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/collective_broadcast_api.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base_mlu import TestCollectiveAPIRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveBroadcastAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype="float32") + paddle.distributed.broadcast(tindata, src=1) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveBroadcastAPI, "broadcast") diff --git a/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh new file mode 100644 index 0000000000000..97f21798c1154 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_c_comm_init_op_mlu.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e +# use default values +# FIXME: random fails on Unknown command lines -c (or -m). +launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py +MLU_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op_mlu.py diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py new file mode 100644 index 0000000000000..447498b9022d4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce_api_mlu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_api_base_mlu import TestDistBase + +paddle.enable_static() + + +class TestCollectiveAllreduceAPI(TestDistBase): + def _setup_config(self): + pass + + def test_allreduce_cncl_fp16(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "float16") + + def test_allreduce_cncl_fp32(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "float32") + + def test_allreduce_cncl_int32(self): + self.check_with_place("collective_allreduce_api.py", "allreduce", + "int32") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py new file mode 100644 index 0000000000000..556fc6fcbb75f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py @@ -0,0 +1,223 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import numpy as np +import unittest +import os +import sys +import subprocess +import pickle +from contextlib import closing +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + + +def DataTypeCast(date_type): + np_data_type = None + + if date_type == "float16": + np_data_type = np.float16 + elif date_type == "float32": + np_data_type = np.float32 + elif date_type == "int32": + np_data_type = np.int32 + else: + raise ValueError("This data type is not support!") + + return np_data_type + + +class TestCollectiveAPIRunnerBase(object): + def get_model(self, train_prog, startup_prog, rank, indata=None): + raise NotImplementedError( + "get model should be implemented by child class.") + + def run_trainer(self, args): + train_prog = fluid.Program() + startup_prog = fluid.Program() + endpoints = args["endpoints"].split(",") + rank = args["trainerid"] + current_endpoint = args["currentendpoint"] + nranks = 2 + paddle.distributed.init_parallel_env() + device_id = int(os.getenv("FLAGS_selected_mlus", "0")) + place = fluid.MLUPlace(device_id) + np.random.seed(os.getpid()) + np_data_type = DataTypeCast(args["data_type"]) + indata = np.random.random((10, 1000)).astype(np_data_type) + if args['static_mode']: + result = self.get_model(train_prog, startup_prog, rank) + exe = fluid.Executor(place) + exe.run(startup_prog) + fetch_list = [] + for elem in result: + fetch_list.append(elem.name) + out = exe.run(train_prog, + feed={'tindata': indata}, + fetch_list=fetch_list) + else: + out = self.get_model(train_prog, startup_prog, rank, indata) + #print(out, sys.stderr) + sys.stdout.buffer.write(pickle.dumps(out)) + + +def runtime_main(test_class, col_type): + args = {} + model = test_class() + args["trainerid"] = int(os.getenv("PADDLE_TRAINER_ID")) + args["trainernum"] = int(os.getenv("PADDLE_TRAINERS_NUM")) + args["endpoints"] = os.getenv('PADDLE_TRAINER_ENDPOINTS') + args["currentendpoint"] = os.getenv("PADDLE_CURRENT_ENDPOINT") + args["col_type"] = col_type + args["backend"] = os.getenv("BACKEND") + args["path_id"] = int(os.getenv("PATH_ID")) + args["static_mode"] = int(os.getenv("STATIC_MODE")) + args["data_type"] = os.getenv("DATA_TYPE") + model.run_trainer(args) + + +import paddle.compat as cpt +import socket +from contextlib import closing + + +class TestDistBase(unittest.TestCase): + def setUp(self): + self._port_set = set() + self._trainers = 2 + self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( + self._find_free_port(), self._find_free_port()) + self._python_interp = sys.executable + + def _find_free_port(self): + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port + + def _run_cluster(self, model_file, envs): + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + #print("w0_ep:",w0_ep," w1_ep:",w1_ep) + env0 = { + "FLAGS_selected_mlus": "0", + "PADDLE_TRAINER_ID": "0", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w0_ep + } + + env1 = { + "FLAGS_selected_mlus": "1", + "PADDLE_TRAINER_ID": "1", + "PADDLE_TRAINERS_NUM": "2", + "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, + "PADDLE_CURRENT_ENDPOINT": w1_ep + } + #update environment + env0.update(envs) + env1.update(envs) + if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': + tr_cmd = "%s -m 
coverage run --branch -p %s" + else: + tr_cmd = "%s %s" + tr0_cmd = tr_cmd % (self._python_interp, model_file) + tr1_cmd = tr_cmd % (self._python_interp, model_file) + tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") + tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") + #print(tr0_cmd) + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + + tr1_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) + with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: + sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) + return pickle.loads(tr0_out), pickle.loads( + tr1_out), tr0_proc.pid, tr1_proc.pid + + def check_with_place(self, + model_file, + col_type, + data_type, + path_id="0", + static_mode="1", + check_error_log=False, + need_envs={}): + required_envs = { + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_eager_delete_tensor_gb": "0.0", + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "LD_PRELOAD": os.getenv("LD_PRELOAD", ""), + "FLAGS_call_stack_level": "2", + "GLOG_v": "3", + "STATIC_MODE": static_mode, + "PADDLE_WITH_GLOO": '0', + "BACKEND": "cncl", + "PATH_ID": path_id, + "DATA_TYPE": data_type + } + required_envs.update(need_envs) + if check_error_log: + required_envs["GLOG_v"] = "3" + required_envs["GLOG_logtostderr"] = "1" + required_envs["GLOO_LOG_LEVEL"] = "TRACE" + tr0_out, tr1_out, pid0, pid1 = self._run_cluster(model_file, + required_envs) + np_data_type = DataTypeCast(data_type) + np.random.seed(pid0) + input1 = np.random.random((10, 1000)).astype(np_data_type) + np.random.seed(pid1) + input2 = np.random.random((10, 1000)).astype(np_data_type) + if col_type == "broadcast": + need_result = input2 + self.assertTrue(np.allclose(tr0_out, need_result)) + self.assertTrue(np.allclose(tr1_out, need_result)) + elif col_type == "allreduce": + need_result = input1 + input2 + self.assertTrue( + np.allclose( + tr0_out, need_result, rtol=1e-05, atol=1e-05)) + self.assertTrue( + np.allclose( + tr1_out, need_result, rtol=1e-05, atol=1e-05)) + else: + pass diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py new file mode 100644 index 0000000000000..95919f3332869 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_broadcast_api_mlu.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_api_base_mlu import TestDistBase + +paddle.enable_static() + + +class TestCollectiveBroadcastAPI(TestDistBase): + def _setup_config(self): + pass + + def test_broadcast_cncl_fp16(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "float16") + + def test_broadcast_cncl_fp32(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "float32") + + def test_broadcast_cncl_int32(self): + self.check_with_place("collective_broadcast_api.py", "broadcast", + "int32") + + +if __name__ == '__main__': + unittest.main() From c39aa18e0d3fe4eddd72ff1d07839655a8af8dbb Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Tue, 8 Mar 2022 10:48:33 +0800 Subject: [PATCH 17/50] [custom kernel]Upgrade support for multiple libs (#40223) * [custom kernel]Upgade support for multi libs * upgrade phi_custom_kernel deps --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/inference/api/CMakeLists.txt | 2 +- paddle/fluid/platform/CMakeLists.txt | 2 +- paddle/fluid/platform/init.cc | 2 +- paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/custom_kernel.cc | 71 ++++++++------------- paddle/phi/core/custom_kernel.h | 14 ++-- paddle/phi/core/kernel_registry.h | 3 +- paddle/phi/kernels/CMakeLists.txt | 2 +- paddle/phi/kernels/sparse/CMakeLists.txt | 2 +- paddle/phi/tests/core/CMakeLists.txt | 2 +- paddle/phi/tests/core/test_custom_kernel.cc | 4 +- paddle/testing/CMakeLists.txt | 2 +- 13 files changed, 45 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e486799495c7a..aa92a3b2226c1 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -443,7 +443,7 @@ cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framewo #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator phi_custom_kernel) +set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6eeb5d6425359..1f83e606c3fde 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator phi_custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 5a47443fd0b52..04c8a329e5e1a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -117,7 +117,7 @@ 
endif()
 cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
 # seperate init from device_context to avoid cycle dependencies
-cc_library(init SRCS init.cc DEPS device_context phi_custom_kernel)
+cc_library(init SRCS init.cc DEPS device_context custom_kernel)
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index cf85dede8e846..293a71dbd968c 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -154,8 +154,8 @@ void LoadCustomDevice(const std::string &library_dir) {
 "Fail to open library: %s with error: %s", lib_path, dlerror()));
 phi::LoadCustomRuntimeLib(lib_path, dso_handle);
- phi::LoadCustomKernelLib(lib_path, dso_handle);
 }
+ phi::CustomKernelMap::Instance().RegisterCustomKernels();
 LOG(INFO) << "Finished in LoadCustomDevice with libs_path: ["
 << library_dir << "]";
 }
diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt
index 424c4ce2ebcc8..b4a6b54d0fe3a 100644
--- a/paddle/phi/core/CMakeLists.txt
+++ b/paddle/phi/core/CMakeLists.txt
@@ -25,7 +25,7 @@ cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
 cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
 cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
-cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils op_registry phi_tensor_raw)
+cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory)
 # Will remove once we implemented MKLDNN_Tensor
 if(WITH_MKLDNN)
diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc
index a333874d03ec1..bc317da8d98ed 100644
--- a/paddle/phi/core/custom_kernel.cc
+++ b/paddle/phi/core/custom_kernel.cc
@@ -12,21 +12,29 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#if defined _WIN32 || defined __APPLE__
-#else
-#define _LINUX
-#endif
-
 #include "paddle/phi/core/custom_kernel.h"
 namespace phi {
-void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
- auto& kernel_info_map = custom_kernel_map.GetMap();
- VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size();
+void CustomKernelMap::RegisterCustomKernel(const std::string& name,
+ const KernelKey& key,
+ const Kernel& kernel) {
+ PADDLE_ENFORCE_EQ(kernels_[name].find(key),
+ kernels_[name].end(),
+ phi::errors::AlreadyExists(
+ "The custom kernel [%s:%s] already exists in "
+ "CustomKernelMap, please check if any duplicate kernel "
+ "info in your lib(s) before loading again.",
+ name,
+ key));
+ kernels_[name][key] = kernel;
+}
+
+void CustomKernelMap::RegisterCustomKernels() {
+ VLOG(3) << "Size of custom_kernel_map: " << kernels_.size();
 auto& kernels = KernelFactory::Instance().kernels();
- for (auto& pair : kernel_info_map) {
+ for (auto& pair : kernels_) {
 PADDLE_ENFORCE_NE(
 kernels.find(pair.first),
 kernels.end(),
@@ -38,8 +46,8 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
 PADDLE_ENFORCE_EQ(
 kernels[pair.first].find(info_pair.first),
 kernels[pair.first].end(),
- phi::errors::InvalidArgument(
- "The operator <%s>'s kernel: %s has been already existed "
+ phi::errors::AlreadyExists(
+ "The kernel [%s:%s] already exists "
 "in Paddle, please contribute PR if it is necessary "
 "to optimize the kernel code. 
Custom kernel does NOT support "
 "to replace existing kernel in Paddle.",
@@ -48,43 +56,14 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) {
 kernels[pair.first][info_pair.first] = info_pair.second;
- VLOG(3) << "Successed in registering operator <" << pair.first
- << ">'s kernel: " << info_pair.first
- << " to Paddle. It will be used like native ones.";
+ VLOG(3) << "Succeeded in registering kernel [" << pair.first << ":"
+ << info_pair.first
+ << "] to Paddle. It will be used like native ones.";
 }
+ kernels_[pair.first].clear();
 }
+ LOG(INFO) << "Succeeded in loading custom kernels.";
+ kernels_.clear();
 }
-void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) {
-#ifdef _LINUX
- typedef phi::CustomKernelMap& get_custom_kernel_map_t();
- auto* func = reinterpret_cast(
- dlsym(dso_handle, "PD_GetCustomKernelMap"));
-
- if (func == nullptr) {
- LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find "
- << "PD_GetCustomKernelMap symbol in this lib.";
- return;
- }
- auto& custom_kernel_map = func();
- phi::RegisterCustomKernels(custom_kernel_map);
- LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path;
-#else
- VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux.";
-#endif
- return;
-}
 } // namespace phi
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// C-API to get global CustomKernelMap.
-phi::CustomKernelMap& PD_GetCustomKernelMap() {
- return phi::CustomKernelMap::Instance();
-}
-
-#ifdef __cplusplus
-} // end extern "C"
-#endif
diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h
index ffd12b9dd03a8..5ba14de6a6131 100644
--- a/paddle/phi/core/custom_kernel.h
+++ b/paddle/phi/core/custom_kernel.h
@@ -29,6 +29,12 @@ class CustomKernelMap {
 return g_custom_kernel_info_map;
 }
+ void RegisterCustomKernel(const std::string& kernel_name,
+ const KernelKey& kernel_key,
+ const Kernel& kernel);
+
+ void RegisterCustomKernels();
+
 KernelNameMap& Kernels() { return kernels_; }
 const KernelNameMap& GetMap() const { return kernels_; }
@@ -40,12 +46,4 @@ class CustomKernelMap {
 KernelNameMap kernels_;
 };
-/**
- * Note:
- * Used to register custom kernels to KernelFactory.
- */
-void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map);
-
-// Load custom kernel lib and register
-void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle);
 } // namespace phi
diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h
index 6a0c7bbc9b7f5..d9ed68593cd61 100644
--- a/paddle/phi/core/kernel_registry.h
+++ b/paddle/phi/core/kernel_registry.h
@@ -210,7 +210,8 @@ struct KernelRegistrar {
 if (reg_type == RegType::INNER) {
 KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
 } else {
- CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel;
+ CustomKernelMap::Instance().RegisterCustomKernel(
+ kernel_name, kernel_key, kernel);
 }
 }
};
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 16fae8d879cc3..58ea231beef7c 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -10,7 +10,7 @@ add_subdirectory(funcs)
 set_property(GLOBAL PROPERTY PHI_KERNELS "")
# [ 1. 
Common kernel compilation dependencies ] -set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) +set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils custom_kernel) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col vol2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) diff --git a/paddle/phi/kernels/sparse/CMakeLists.txt b/paddle/phi/kernels/sparse/CMakeLists.txt index a319e9a13c3f7..eaea6d952167c 100644 --- a/paddle/phi/kernels/sparse/CMakeLists.txt +++ b/paddle/phi/kernels/sparse/CMakeLists.txt @@ -1,3 +1,3 @@ -set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function) +set(SPARSE_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel) register_kernels(DEPS ${SPARSE_KERNEL_DEPS} SUB_DIR "sparse_kernel") diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 5356bac9fbd80..de9bd7a4d479c 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS phi_custom_kernel) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index a4e89231e14f8..6fe34a6891a35 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -172,7 +172,9 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // register - phi::RegisterCustomKernels(phi::CustomKernelMap::Instance()); + phi::CustomKernelMap::Instance().RegisterCustomKernels(); + + EXPECT_EQ(0, static_cast(custom_fake_dot_kernels.size())); EXPECT_TRUE(fake_dot_kernels.find( phi::KernelKey(backend, layout, phi::DataType::FLOAT32)) != diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index eace7c41f4a31..0cc68bf31617c 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -1,5 +1,5 @@ # for paddle test case if(WITH_TESTING) - cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags) + cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags proto_desc) endif() From d4b007af8bfa82df134220690115fcd58122de26 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Tue, 8 Mar 2022 10:53:28 +0800 Subject: [PATCH 18/50] add share dims (#40238) --- paddle/fluid/framework/infershape_utils.cc | 20 +++++++------ paddle/phi/core/meta_tensor.cc | 35 +++++++++++++++++----- paddle/phi/core/meta_tensor.h | 3 +- 3 files changed, 41 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 7232a707916dd..91ef59575c3aa 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -232,16 +232,8 @@ class 
CompatMetaTensor : public phi::MetaTensor { } } - void share_meta(const MetaTensor& meta_tensor) override { + void share_dims(const MetaTensor& meta_tensor) override { set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor - share_lod(meta_tensor); - - // special case 2: share height and rows of SelectedRows in runtime if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); if (var->IsType()) { @@ -254,6 +246,16 @@ class CompatMetaTensor : public phi::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + + // special case 1: share lod of LoDTensor + share_lod(meta_tensor); + share_dims(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 2aadce4feda96..eb114304f53ea 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -98,13 +98,9 @@ const LoD& MetaTensor::lod() const { } void MetaTensor::share_meta(const MetaTensor& meta_tensor) { - if (phi::DenseTensor::classof(tensor_)) { - set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - set_layout(meta_tensor.layout()); - share_lod(meta_tensor); - } else if (phi::SelectedRows::classof(tensor_)) { - set_dims(meta_tensor.dims()); + if (phi::DenseTensor::classof(tensor_) || + phi::SelectedRows::classof(tensor_)) { + share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); set_layout(meta_tensor.layout()); share_lod(meta_tensor); @@ -114,4 +110,29 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } +TensorBase* MetaTensor::get_tensor() const { return tensor_; } + +void MetaTensor::share_dims(const MetaTensor& meta_tensor) { + bool is_dense_tensor = phi::DenseTensor::classof(tensor_); + bool is_selected_rows = phi::SelectedRows::classof(tensor_); + if (is_dense_tensor || is_selected_rows) { + set_dims(meta_tensor.dims()); + if (is_selected_rows) { + const auto in_tensor_base = meta_tensor.get_tensor(); + PADDLE_ENFORCE_EQ( + phi::SelectedRows::classof(in_tensor_base), + true, + errors::InvalidArgument("The input MetaTensor is SelectedRows, but " + "the output MetaTensor is not this type.")); + auto* selected_rows_out = static_cast(tensor_); + auto* selected_rows_in = static_cast(in_tensor_base); + selected_rows_out->set_rows(selected_rows_in->rows()); + selected_rows_out->set_height(selected_rows_in->height()); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported sharing dims for `%s`.", tensor_->type_info().name())); + } +} + } // namespace phi diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 1a32019a19049..3971a9f7e99e0 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -60,12 +60,13 @@ class MetaTensor { virtual void share_lod(const MetaTensor& meta_tensor); virtual void share_meta(const MetaTensor& meta_tensor); + virtual void share_dims(const MetaTensor& meta_tensor); private: // Because the lod in compiletime and runtime is different, // so `LoD` cannot in public methods const LoD& lod() const; - + TensorBase* get_tensor() const; TensorBase* tensor_; }; From f876320a9836a6a12ab6e8b3ddb079fc2ae6e746 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 8 Mar 
2022 11:15:39 +0800 Subject: [PATCH 19/50] support code auto-gene for sparse backward api (#40196) --- .gitignore | 2 + paddle/phi/api/lib/CMakeLists.txt | 26 ++- .../paddle/utils/code_gen/backward_api_gen.py | 1 + .../paddle/utils/code_gen/sparse_api_gen.py | 9 +- .../paddle/utils/code_gen/sparse_bw_api.yaml | 6 + .../utils/code_gen/sparse_bw_api_gen.py | 200 ++++++++++++++++++ 6 files changed, 235 insertions(+), 9 deletions(-) create mode 100644 python/paddle/utils/code_gen/sparse_bw_api.yaml create mode 100644 python/paddle/utils/code_gen/sparse_bw_api_gen.py diff --git a/.gitignore b/.gitignore index a2009a1ed30a1..21222678f049c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,12 +6,14 @@ paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h +paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc paddle/phi/api/lib/sparse_api.cc +paddle/phi/api/lib/sparse_bw_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 4f449c578bab0..926ddf8ba49f8 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -40,6 +40,14 @@ set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) set(sparse_api_header_file_tmp ${api_header_file}.tmp) set(sparse_api_source_file_tmp ${api_source_file}.tmp) +# sparse bw api file +set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api_gen.py) +set(sparse_bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml) +set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) +set(sparse_bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_bw_api.cc) +set(sparse_bw_api_header_file_tmp ${sparse_bw_api_header_file}.tmp) +set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp) + # wrapped infermeta file set(wrapped_infermeta_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py) set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml) @@ -91,7 +99,20 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_header_file_tmp} ${sparse_api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_api_source_file_tmp} ${sparse_api_source_file} COMMENT "copy_if_different ${sparse_api_header_file} ${sparse_sparse_api_source_file}" - DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} + DEPENDS ${sparse_api_yaml_file} ${sparse_api_gen_file} ${api_gen_base} ${api_gen_file} + VERBATIM) + +# generate backward sparse api +add_custom_command( + OUTPUT ${sparse_bw_api_header_file} ${sparse_bw_api_source_file} + COMMAND ${PYTHON_EXECUTABLE} ${sparse_bw_api_gen_file} + --api_yaml_path ${sparse_bw_api_yaml_file} + --api_header_path ${sparse_bw_api_header_file_tmp} + --api_source_path ${sparse_bw_api_source_file_tmp} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_header_file_tmp} ${sparse_bw_api_header_file} + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${sparse_bw_api_source_file_tmp} ${sparse_bw_api_source_file} + COMMENT "copy_if_different ${sparse_bw_api_header_file} ${sparse_bw_sparse_api_source_file}" + DEPENDS 
${sparse_bw_api_yaml_file} ${sparse_bw_api_gen_file} ${api_gen_base} ${api_gen_file} ${sparse_api_gen_file} ${bw_api_gen_file} VERBATIM) # generate wrapped infermeta @@ -113,9 +134,10 @@ cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfe cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) -cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl) cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform) cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl) +cc_library(sparse_api SRCS ${sparse_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api_custom_impl) +cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api sparse_api_custom_impl) cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index b9f991f9b0f88..7bd488cc11475 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -35,6 +35,7 @@ def parse_forward_config(self, forward_config): forward_config) api = result.group('api') _, outputs, _ = self.parse_output(self.api, result.group('outputs')) + outputs = [item.split('@')[0] for item in outputs] fw_inputs, fw_attrs, _, = self.parse_input_and_attr( api, result.group('args')) diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index 99c5a4f49f8c4..d845653f48831 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -17,10 +17,10 @@ import argparse import re -from api_base import BaseAPI +from api_gen import ForwardAPI -class SparseAPI(BaseAPI): +class SparseAPI(ForwardAPI): def __init__(self, api_item_yaml): super(SparseAPI, self).__init__(api_item_yaml) @@ -30,11 +30,6 @@ def get_api_name(self, api_item_yaml): def get_api_func_name(self): return self.api - def get_return_type(self, out_type_list): - return out_type_list[0] if len( - out_type_list) == 1 else "std::tuple<" + ",".join( - out_type_list) + ">" - def gene_api_declaration(self): return f""" // {", ".join(self.outputs['names'])} diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml new file mode 100644 index 0000000000000..c71dce502992f --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -0,0 +1,6 @@ +- sparse_bw_api : conv3d_grad + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor kernel, Tensor rulebook, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups) + output : Tensor(x_grad@DenseTensor), Tensor(kernel_grad@DenseTensor) + kernel : + 
func : sparse_conv_grad diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py new file mode 100644 index 0000000000000..6ef294caa1473 --- /dev/null +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import yaml +import argparse +import re + +from sparse_api_gen import SparseAPI +from backward_api_gen import BackwardAPI + + +class SparseBackwardAPI(SparseAPI, BackwardAPI): + def __init__(self, bw_api_item_yaml): + BackwardAPI.__init__(self, bw_api_item_yaml) + + def get_api_name(self, api_item_yaml): + return api_item_yaml['sparse_bw_api'] + + def get_api_func_name(self): + return self.api + + def get_return_type(self, out_type_list): + return BackwardAPI.get_return_type(self, out_type_list) + + def gene_api_declaration(self): + return SparseAPI.gene_api_declaration(self) + + def gene_output(self, + output_type_list, + set_out_func, + code_indent, + inplace_flag=False): + kernel_output = "" + output_names = [] + output_create = "" + + if len(output_type_list) == 1: + kernel_output = 'kernel_out' + output_names.append('kernel_out') + inplace_assign = " = " + self.inplace_map[self.outputs['names'][ + 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][0] in self.inplace_map else "" + output_create = f""" + {self.outputs['return_type']} out{inplace_assign}; + auto kernel_out = {set_out_func}(&out, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + + elif len(output_type_list) > 1: + output_create = f""" + {self.outputs['return_type']} out({len(output_type_list)});""" + + for i, out_type_item in enumerate(output_type_list): + kernel_output = kernel_output + f'kernel_out_{i}, ' + output_names.append(f'kernel_out_{i}') + if out_type_item == 'Tensor': + get_out_code = f'&out[{i}][0]' + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + out[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" + + else: + output_create = output_create + f""" + out[{i}].emplace_back();""" + + else: + get_out_code = f'&out[{i}]' + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + out[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" + + output_create = output_create + f""" + auto kernel_out_{i} = {set_out_func}({get_out_code}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + + kernel_output = kernel_output[:-2] + else: + raise ValueError( + "{} : Output error: the output should not be empty.".format( + self.api)) + + return kernel_output, output_names, output_create + + +def header_include(): + return """ +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include 
"paddle/utils/optional.h" +""" + + +def source_include(header_file_path): + return f""" +#include "{header_file_path}" +#include + +#include "glog/logging.h" + +#include "paddle/phi/api/lib/api_registry.h" +#include "paddle/phi/api/lib/api_gen_utils.h" +#include "paddle/phi/api/lib/kernel_dispatch.h" +#include "paddle/phi/api/lib/sparse_api_custom_impl.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/declarations.h" +""" + + +def api_register(): + return """ +PD_REGISTER_API(Test); +""" + + +def api_namespace(): + return (""" +namespace paddle { +namespace experimental { +namespace sparse { + +""", """ + +} // namespace sparse +} // namespace experimental +} // namespace paddle +""") + + +def generate_api(api_yaml_path, header_file_path, source_file_path): + + with open(api_yaml_path, 'r') as f: + apis = yaml.load(f, Loader=yaml.FullLoader) + header_file = open(header_file_path, 'w') + source_file = open(source_file_path, 'w') + + namespace = api_namespace() + + header_file.write("#pragma once\n") + header_file.write(header_include()) + header_file.write(namespace[0]) + + include_header_file = "paddle/phi/api/backward/sparse_bw_api.h" + source_file.write(source_include(include_header_file)) + source_file.write(namespace[0]) + + for api in apis: + sparse_bw_api = SparseBackwardAPI(api) + header_file.write(sparse_bw_api.gene_api_declaration()) + source_file.write(sparse_bw_api.gene_api_code()) + + header_file.write(namespace[1]) + source_file.write(namespace[1]) + + source_file.write(api_register()) + + header_file.close() + source_file.close() + + +def main(): + parser = argparse.ArgumentParser( + description='Generate PaddlePaddle C++ Sparse API files') + parser.add_argument( + '--api_yaml_path', + help='path to sparse api yaml file', + default='python/paddle/utils/code_gen/sparse_bw_api.yaml') + + parser.add_argument( + '--api_header_path', + help='output of generated api header code file', + default='paddle/phi/api/backward/sparse_bw_api.h') + + parser.add_argument( + '--api_source_path', + help='output of generated api source code file', + default='paddle/phi/api/lib/sparse_bw_api.cc') + + options = parser.parse_args() + + api_yaml_path = options.api_yaml_path + header_file_path = options.api_header_path + source_file_path = options.api_source_path + + generate_api(api_yaml_path, header_file_path, source_file_path) + + +if __name__ == '__main__': + main() From 3c536f2e65c65cc986e8aaff86214426498d1f7a Mon Sep 17 00:00:00 2001 From: WJJ1995 Date: Tue, 8 Mar 2022 13:11:12 +0800 Subject: [PATCH 20/50] =?UTF-8?q?[phi]=20move=20isnan=5Fv2=E3=80=81isfinit?= =?UTF-8?q?e=5Fv2=E3=80=81isinf=5Fv2=20to=20phi=20(#40076)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * support isfinite for phi * mark v2 * fixed bugs * fixed include bugs * deal with comments * decoupling selected_rows * rm bfloat16 * fixed infermeta * fixed code style * rm useless code * replace pt by pd --- paddle/fluid/operators/isfinite_v2_op.cc | 64 ++++--------- paddle/fluid/operators/isfinite_v2_op.cu | 55 ----------- paddle/phi/core/compat/op_utils.h | 3 + paddle/phi/infermeta/unary.cc | 5 + paddle/phi/infermeta/unary.h | 2 + paddle/phi/kernels/cpu/isfinite_kernel.cc | 62 ++++++++++++ .../kernels/funcs/isfinite_functor.h} | 33 +++---- paddle/phi/kernels/gpu/isfinite_kernel.cu | 61 ++++++++++++ .../phi/kernels/impl/isfinite_kernel_impl.h | 39 ++++++++ paddle/phi/kernels/isfinite_kernel.h | 31 ++++++ .../kernels/selected_rows/isfinite_kernel.cc | 96 
+++++++++++++++++++ .../kernels/selected_rows/isfinite_kernel.h | 31 ++++++ .../selected_rows/isfinite_kernel_impl.h | 39 ++++++++ paddle/phi/ops/compat/isfinite_sig.cc | 19 ++++ 14 files changed, 419 insertions(+), 121 deletions(-) delete mode 100644 paddle/fluid/operators/isfinite_v2_op.cu create mode 100644 paddle/phi/kernels/cpu/isfinite_kernel.cc rename paddle/{fluid/operators/isfinite_v2_op.h => phi/kernels/funcs/isfinite_functor.h} (52%) create mode 100644 paddle/phi/kernels/gpu/isfinite_kernel.cu create mode 100644 paddle/phi/kernels/impl/isfinite_kernel_impl.h create mode 100644 paddle/phi/kernels/isfinite_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/isfinite_kernel.cc create mode 100644 paddle/phi/kernels/selected_rows/isfinite_kernel.h create mode 100644 paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h create mode 100644 paddle/phi/ops/compat/isfinite_sig.cc diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203b1..cfa370ff9cb19 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. 
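Note: unlike the reducing overflow op, the *_v2 variants return an elementwise boolean tensor. NumPy's predicates have the same semantics, which pins down the expected outputs:

    import numpy as np

    x = np.array([1.0, float("nan"), float("inf"), float("-inf"), 2.5])
    print(np.isnan(x))     # [False  True False False False]
    print(np.isinf(x))     # [False False  True  True False]
    print(np.isfinite(x))  # [ True False False False  True]
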
} // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36dfa0..0000000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 8f64a7145eda5..9947e00ecb53c 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -40,6 +40,9 @@ const std::unordered_set standard_kernel_suffixs({ const std::unordered_set deprecated_op_names({"diag", "flatten", "flatten_grad", + "isinf", + "isnan", + "isfinite", "matmul", "matmul_grad", "matmul_grad_grad", diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 4053cfbc362e3..17edc84618726 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -1007,6 +1007,11 @@ void SizeInferMeta(const MetaTensor& input, MetaTensor* out) { out->set_dims({1}); } +void IsfiniteInferMeta(const MetaTensor& x, MetaTensor* out) { + out->set_dims(x.dims()); + out->set_dtype(DataType::BOOL); +} + void PixelShuffleInferMeta(const MetaTensor& x, int upscale_factor, const std::string& data_format, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index a679ef8c11af6..dac7c19cf9b08 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -157,6 +157,8 @@ void PixelShuffleInferMeta(const MetaTensor& x, const std::string& data_format, MetaTensor* out); +void IsfiniteInferMeta(const MetaTensor& input, MetaTensor* out); + void TransposeInferMeta(const MetaTensor& x, const std::vector& axis, MetaTensor* out); diff --git a/paddle/phi/kernels/cpu/isfinite_kernel.cc b/paddle/phi/kernels/cpu/isfinite_kernel.cc new file mode 100644 index 0000000000000..33a7429a22a1a --- /dev/null +++ b/paddle/phi/kernels/cpu/isfinite_kernel.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
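Note: IsfiniteInferMeta above is deliberately tiny: the output keeps the input's dims and always gets a bool dtype. The whole contract, as a self-contained sketch in which tuples stand in for phi's MetaTensor:

    def isfinite_infer_meta(x_shape):
        # same dims as x; the output dtype is forced to bool
        return x_shape, "bool"

    assert isfinite_infer_meta((10, 1000)) == ((10, 1000), "bool")
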
+ +#include "paddle/phi/kernels/isfinite_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h" + +namespace phi { + +template +inline void IsfiniteKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + dev_ctx.template Alloc(out); + Functor functor; + functor(x, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(isinf, + CPU, + ALL_LAYOUT, + phi::IsinfKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan, + CPU, + ALL_LAYOUT, + phi::IsnanKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite, + CPU, + ALL_LAYOUT, + phi::IsfiniteKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/fluid/operators/isfinite_v2_op.h b/paddle/phi/kernels/funcs/isfinite_functor.h similarity index 52% rename from paddle/fluid/operators/isfinite_v2_op.h rename to paddle/phi/kernels/funcs/isfinite_functor.h index b646e460ec75b..c804bee8d4c68 100644 --- a/paddle/fluid/operators/isfinite_v2_op.h +++ b/paddle/phi/kernels/funcs/isfinite_functor.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,39 +14,32 @@ #pragma once -#include - -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/isfinite_op.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace operators { +namespace funcs { struct InfinityV2Functor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { - framework::TensorContainsInfV2(tensor, out); + void operator()(const DenseTensor& tensor, DenseTensor* out) { + paddle::framework::TensorContainsInfV2(tensor, out); } }; struct NANV2Functor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { - framework::TensorContainsNANV2(tensor, out); + void operator()(const DenseTensor& tensor, DenseTensor* out) { + paddle::framework::TensorContainsNANV2(tensor, out); } }; struct IsfiniteV2Functor { - void operator()(const framework::Tensor& tensor, framework::Tensor* out) { - framework::TensorIsfiniteV2(tensor, out); + void operator()(const DenseTensor& tensor, DenseTensor* out) { + paddle::framework::TensorIsfiniteV2(tensor, out); } }; -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/isfinite_kernel.cu b/paddle/phi/kernels/gpu/isfinite_kernel.cu new file mode 100644 index 0000000000000..4b41ed1e55d39 --- /dev/null +++ b/paddle/phi/kernels/gpu/isfinite_kernel.cu @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h" +#include "paddle/phi/kernels/isfinite_kernel.h" + +namespace phi { + +template +inline void IsfiniteKernelImpl(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + dev_ctx.template Alloc(out); + Functor functor; + functor(x, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(isinf, + GPU, + ALL_LAYOUT, + phi::IsinfKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan, + GPU, + ALL_LAYOUT, + phi::IsnanKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite, + GPU, + ALL_LAYOUT, + phi::IsfiniteKernel, + float, + double, + phi::dtype::float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/impl/isfinite_kernel_impl.h b/paddle/phi/kernels/impl/isfinite_kernel_impl.h new file mode 100644 index 0000000000000..affa85f8a2d28 --- /dev/null +++ b/paddle/phi/kernels/impl/isfinite_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/isfinite_functor.h" +#include "paddle/phi/kernels/isfinite_kernel.h" + +namespace phi { + +template +inline void IsfiniteKernelImpl(const Context& ctx, + const DenseTensor& x, + DenseTensor* out); + +#define DEFINE_ISFINITE_KERNEL(isfinite_kernel, functor) \ + template \ + void isfinite_kernel( \ + const Context& ctx, const DenseTensor& x, DenseTensor* out) { \ + IsfiniteKernelImpl(ctx, x, out); \ + } + +DEFINE_ISFINITE_KERNEL(IsinfKernel, funcs::InfinityV2Functor) +DEFINE_ISFINITE_KERNEL(IsnanKernel, funcs::NANV2Functor) +DEFINE_ISFINITE_KERNEL(IsfiniteKernel, funcs::IsfiniteV2Functor) +#undef DEFINE_ISFINITE_KERNEL + +} // namespace phi diff --git a/paddle/phi/kernels/isfinite_kernel.h b/paddle/phi/kernels/isfinite_kernel.h new file mode 100644 index 0000000000000..e695a8e074223 --- /dev/null +++ b/paddle/phi/kernels/isfinite_kernel.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
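Note: the impl and declaration headers stamp out IsinfKernel, IsnanKernel and IsfiniteKernel from one template plus a functor, so the three kernels differ only in the predicate they apply. The same pattern in Python is a loop over (name, predicate) pairs, a sketch rather than the phi macros:

    import numpy as np

    def make_kernel(predicate):
        def kernel(x):
            # the real kernels allocate the bool output, then apply the functor
            return predicate(x)
        return kernel

    KERNELS = {name: make_kernel(pred)
               for name, pred in (("isinf", np.isinf),
                                  ("isnan", np.isnan),
                                  ("isfinite", np.isfinite))}

    print(KERNELS["isnan"](np.array([0.0, float("nan")])))  # [False  True]
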
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define DEFINE_ISFINITE_KERNEL(isfinite_kernel) \ + template \ + void isfinite_kernel( \ + const Context& ctx, const DenseTensor& x, DenseTensor* out); + +DEFINE_ISFINITE_KERNEL(IsinfKernel) +DEFINE_ISFINITE_KERNEL(IsnanKernel) +DEFINE_ISFINITE_KERNEL(IsfiniteKernel) +#undef DEFINE_ISFINITE_KERNEL + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.cc b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc new file mode 100644 index 0000000000000..a507cdd0d866c --- /dev/null +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/isfinite_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/backends/gpu/gpu_context.h" +#endif +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h" + +namespace phi { + +template +inline void IsfiniteSRImpl(const Context& dev_ctx, + const SelectedRows& x, + SelectedRows* out) { + dev_ctx.template Alloc(out); + Functor functor; + functor(x.value(), out->mutable_value()); +} +} // namespace phi + +PD_REGISTER_KERNEL(isinf_sr, + CPU, + ALL_LAYOUT, + phi::IsinfSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan_sr, + CPU, + ALL_LAYOUT, + phi::IsnanSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite_sr, + CPU, + ALL_LAYOUT, + phi::IsfiniteSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(isinf_sr, + GPU, + ALL_LAYOUT, + phi::IsinfSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isnan_sr, + GPU, + ALL_LAYOUT, + phi::IsnanSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} + +PD_REGISTER_KERNEL(isfinite_sr, + GPU, + ALL_LAYOUT, + phi::IsfiniteSR, + float, + double, + phi::dtype::float16, + int, + int64_t) {} +#endif diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel.h b/paddle/phi/kernels/selected_rows/isfinite_kernel.h new file mode 100644 index 0000000000000..948d8c89477a2 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
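Note: the selected-rows flavours reuse the same functors and simply run them over the dense value tensor of the SelectedRows input. A toy model of that wrapper (illustrative class, not phi's SelectedRows; the row bookkeeping is carried through unchanged here for illustration):

    import numpy as np

    class ToySelectedRows:
        def __init__(self, rows, height, value):
            self.rows, self.height, self.value = rows, height, value

    def isfinite_sr(x):
        # the predicate applies to the dense value block only
        return ToySelectedRows(list(x.rows), x.height, np.isfinite(x.value))

    sr = ToySelectedRows([0, 2], 4, np.array([[1.0, float("inf")],
                                              [float("nan"), 3.0]]))
    print(isfinite_sr(sr).value)  # [[ True False] [False  True]]
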
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +#define DEFINE_ISFINITE_SR(isfinite_sr) \ + template \ + void isfinite_sr( \ + const Context& ctx, const SelectedRows& x, SelectedRows* out); + +DEFINE_ISFINITE_SR(IsinfSR) +DEFINE_ISFINITE_SR(IsnanSR) +DEFINE_ISFINITE_SR(IsfiniteSR) +#undef DEFINE_ISFINITE_SR + +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h b/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h new file mode 100644 index 0000000000000..c53abdf996c47 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/isfinite_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/isfinite_functor.h" +#include "paddle/phi/kernels/selected_rows/isfinite_kernel.h" + +namespace phi { + +template +inline void IsfiniteSRImpl(const Context& ctx, + const SelectedRows& x, + SelectedRows* out); + +#define DEFINE_ISFINITE_SR(isfinite_sr, functor) \ + template \ + void isfinite_sr( \ + const Context& ctx, const SelectedRows& x, SelectedRows* out) { \ + IsfiniteSRImpl(ctx, x, out); \ + } + +DEFINE_ISFINITE_SR(IsinfSR, funcs::InfinityV2Functor) +DEFINE_ISFINITE_SR(IsnanSR, funcs::NANV2Functor) +DEFINE_ISFINITE_SR(IsfiniteSR, funcs::IsfiniteV2Functor) +#undef DEFINE_ISFINITE_SR + +} // namespace phi diff --git a/paddle/phi/ops/compat/isfinite_sig.cc b/paddle/phi/ops/compat/isfinite_sig.cc new file mode 100644 index 0000000000000..218b4c2f962c4 --- /dev/null +++ b/paddle/phi/ops/compat/isfinite_sig.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +PD_REGISTER_BASE_KERNEL_NAME(isinf_v2, isinf); +PD_REGISTER_BASE_KERNEL_NAME(isnan_v2, isnan); +PD_REGISTER_BASE_KERNEL_NAME(isfinite_v2, isfinite); From 13f2b1e381d6ab112dd431bdb415e7ea04fbb7b7 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Tue, 8 Mar 2022 13:29:26 +0800 Subject: [PATCH 21/50] [phi] transfer accuracy op and pass the unittests (#39982) * transfer accuracy op and pass the ci * remove header file * fix code * fix code * fix * fix --- paddle/fluid/operators/metrics/accuracy_op.cc | 9 +- paddle/fluid/operators/metrics/accuracy_op.cu | 110 ---------------- paddle/fluid/operators/metrics/accuracy_op.h | 74 ----------- .../operators/metrics/accuracy_op_mlu.cc | 3 +- .../operators/metrics/accuracy_op_npu.cc | 2 +- .../operators/metrics/accuracy_op_xpu.cc | 4 +- paddle/phi/kernels/accuracy_kernel.h | 30 +++++ paddle/phi/kernels/cpu/accuracy_kernel.cc | 72 +++++++++++ paddle/phi/kernels/gpu/accuracy_kernel.cu | 117 ++++++++++++++++++ 9 files changed, 228 insertions(+), 193 deletions(-) delete mode 100644 paddle/fluid/operators/metrics/accuracy_op.cu delete mode 100644 paddle/fluid/operators/metrics/accuracy_op.h create mode 100644 paddle/phi/kernels/accuracy_kernel.h create mode 100644 paddle/phi/kernels/cpu/accuracy_kernel.cc create mode 100644 paddle/phi/kernels/gpu/accuracy_kernel.cu diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5a4..056620db5b966 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -123,13 +123,10 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d37..0000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. -// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e67..0000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277c9..1ce02ff4525c9 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index e83278f88b82a..9f2ca4165f33a 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -13,7 +13,7 @@ limitations under the License. 
 
 #include <memory>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
index de71312d78df9..3cc1be4de8a82 100644
--- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc
@@ -14,12 +14,14 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/metrics/accuracy_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
 
+using Tensor = paddle::framework::Tensor;
 template <typename DeviceContext, typename T>
 class AccuracyXPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/phi/kernels/accuracy_kernel.h b/paddle/phi/kernels/accuracy_kernel.h
new file mode 100644
index 0000000000000..8f2dbb96f8654
--- /dev/null
+++ b/paddle/phi/kernels/accuracy_kernel.h
@@ -0,0 +1,30 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void AccuracyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& out,
+                       const DenseTensor& indices,
+                       const DenseTensor& label,
+                       DenseTensor* accuracy,
+                       DenseTensor* correct,
+                       DenseTensor* total);
+}  // namespace phi
diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc
new file mode 100644
index 0000000000000..c57ec69b73a23
--- /dev/null
+++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/accuracy_kernel.h"
+
+#include <algorithm>
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+template <typename T, typename Context>
+void AccuracyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& inference,
+                       const DenseTensor& indices,
+                       const DenseTensor& label,
+                       DenseTensor* accuracy,
+                       DenseTensor* correct,
+                       DenseTensor* total) {
+  int* correct_data = dev_ctx.template Alloc<int>(correct);
+  int* total_data = dev_ctx.template Alloc<int>(total);
+  float* accuracy_data = dev_ctx.template Alloc<float>(accuracy);
+
+  const int64_t* indices_data = indices.data<int64_t>();
+  const int64_t* label_data = label.data<int64_t>();
+
+  size_t num_samples = inference.dims()[0];
+  size_t class_dim = inference.dims()[1];
+  *accuracy_data = 0.0f;
+
+  if (num_samples == 0) {
+    return;
+  }
+
+  int num_correct = 0;
+  // assume inference is already the topk of the output
+  for (size_t i = 0; i < num_samples; ++i) {
+    PADDLE_ENFORCE_GE(
+        label_data[i],
+        0,
+        phi::errors::InvalidArgument(
+            "label of AccuracyOp must >= 0, But received label[%d] is %d",
+            i,
+            label_data[i]));
+    for (size_t j = 0; j < class_dim; ++j) {
+      if (indices_data[i * class_dim + j] == label_data[i]) {
+        ++num_correct;
+        break;
+      }
+    }
+  }
+
+  *correct_data = num_correct;
+  *total_data = num_samples;
+  *accuracy_data =
+      static_cast<float>(num_correct) / static_cast<float>(num_samples);
+}
+}  // namespace phi
+
+// TODO(add supported dtype.)
+PD_REGISTER_KERNEL(
+    accuracy, CPU, ALL_LAYOUT, phi::AccuracyRawKernel, float, double) {}
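For a quick end-to-end check of the newly registered kernels, `paddle.metric.accuracy` is a Python-level entry that feeds this op its top-k indices and labels; a small usage sketch (values illustrative):

```python
import paddle

pred = paddle.to_tensor([[0.1, 0.7, 0.2],
                         [0.5, 0.3, 0.2]], dtype='float32')
label = paddle.to_tensor([[1], [2]], dtype='int64')
acc = paddle.metric.accuracy(input=pred, label=label, k=1)
print(acc.numpy())  # [0.5]: only the first sample's top-1 matches its label
```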
diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu
new file mode 100644
index 0000000000000..f08fb74e54d8c
--- /dev/null
+++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu
@@ -0,0 +1,117 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/accuracy_kernel.h"
+
+#include <thrust/execution_policy.h>
+#include <thrust/reduce.h>
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+
+template <int BlockSize>
+__global__ void AccuracyCudaKernel(const int N,
+                                   const int D,
+                                   const int64_t* Xdata,
+                                   const int64_t* labeldata,
+                                   int* correct_data,
+                                   float* accuracy,
+                                   int* total_data) {
+  int count = 0;
+  __shared__ int total[BlockSize];
+
+  // support only 1 block
+  for (int i = threadIdx.x; i < (N); i += BlockSize) {
+    for (int j = 0; j < D; ++j) {
+      if (Xdata[i * D + j] == labeldata[i]) {
+        ++count;
+        break;
+      }
+    }
+  }
+  total[threadIdx.x] = count;
+  __syncthreads();
+
+// reduce the count with init value 0, and output accuracy.
+#ifdef PADDLE_WITH_CUDA
+  int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+#else
+  // HIP thrust::reduce not support __device__
+  for (int s = BlockSize / 2; s > 0; s >>= 1) {
+    if (threadIdx.x < s) {
+      total[threadIdx.x] += total[threadIdx.x + s];
+    }
+    __syncthreads();
+  }
+  int result = total[0];
+#endif
+  if (threadIdx.x == 0) {
+    *correct_data = result;
+    *accuracy = static_cast<float>(result) / static_cast<float>(N);
+    *total_data = N;
+  }
+}
+
+template <typename T, typename Context>
+void AccuracyRawKernel(const Context& dev_ctx,
+                       const DenseTensor& inference,
+                       const DenseTensor& indices,
+                       const DenseTensor& label,
+                       DenseTensor* accuracy,
+                       DenseTensor* correct,
+                       DenseTensor* total) {
+  // FIXME(typhoonzero): only support indices currently
+  // if add support for output values, how to detect the data type?
+  const int64_t* indices_data = indices.data<int64_t>();
+  const int64_t* label_data = label.data<int64_t>();
+
+  int* correct_data = dev_ctx.template Alloc<int>(correct);
+  int* total_data = dev_ctx.template Alloc<int>(total);
+  float* accuracy_data = dev_ctx.template Alloc<float>(accuracy);
+
+  int num_samples = static_cast<int>(inference.dims()[0]);
+  size_t infer_width = inference.dims()[1];
+  auto stream = dev_ctx.stream();
+  phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
+
+  if (num_samples == 0) {
+    return;
+  }
+
+  AccuracyCudaKernel<
+      PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+      num_samples,
+      infer_width,
+      indices_data,
+      label_data,
+      correct_data,
+      accuracy_data,
+      total_data);
+}
+}  // namespace phi
+
+// FIXME(typhoonzero): types of T is for inference data.
+// label data is always int64
+PD_REGISTER_KERNEL(accuracy,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::AccuracyRawKernel,
+                   phi::dtype::float16,
+                   float,
+                   double) {}

From a279a4f8576667bf86258f1e9e59b6a05b6ce00e Mon Sep 17 00:00:00 2001
From: Allen Guo
Date: Tue, 8 Mar 2022 13:57:42 +0800
Subject: [PATCH 22/50] [IPU] update ipu unittests p2 (#40069)

* update ipu UTs part2
* clean git
* rename ut
* rename ut 1
* sync api changes
* update uts for new api
* update uts for new api
* fix re-define
---
 .../tests/unittests/ipu/test_ipu_pipeline.py  |  71 ------
 .../tests/unittests/ipu/test_ipu_place.py     |  51 -----
 .../tests/unittests/ipu/test_ipu_shard.py     |  70 ------
 .../unittests/ipu/test_ipu_shard_api_ipu.py   | 112 ++++++++++
 .../tests/unittests/ipu/test_ipu_strategy.py  |  56 -----
 .../unittests/ipu/test_ipu_strategy_ipu.py    |  72 ++++++
 .../unittests/ipu/test_layernorm_op_ipu.py    | 134 +++++++----
 .../unittests/ipu/test_log_softmax_op_ipu.py  |  87 ++++----
 .../unittests/ipu/test_logical_not_op_ipu.py  |  97 ++++++++
 .../unittests/ipu/test_lookuptable_op_ipu.py  | 102 ++++-----
 .../ipu/test_lookuptable_v2_op_ipu.py         | 141 ++++++++++++
 ...lr_sheduelr.py => test_lr_sheduler_ipu.py} |   6 +-
 .../tests/unittests/ipu/test_matmul_op_ipu.py | 208 ++++++++++--------
 13 files changed, 723 insertions(+), 484 deletions(-)
 delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py
 delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py
 delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py
 delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py
 create mode 100644
python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_lr_sheduelr.py => test_lr_sheduler_ipu.py} (95%) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py deleted file mode 100644 index beab68553d723..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_pipeline.py +++ /dev/null @@ -1,71 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuShard(unittest.TestCase): - def _test(self): - # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_stage : no - - with paddle.fluid.ipu_shard(ipu_stage=1): - c = b + 1 # scale, ipu_stage : 1 - with paddle.fluid.ipu_shard(ipu_stage=2): - d = c * 2 # scale, ipu_stage : 2 - with paddle.fluid.ipu_shard(ipu_stage=3): - e = d + 3 # scale, ipu_stage : 3 - with paddle.fluid.ipu_shard(ipu_stage=1): - e = e + 3 # scale, ipu_stage : 1 - with paddle.fluid.ipu_shard(ipu_stage=2): - e = e + 3 # scale, ipu_stage : 2 - - with paddle.fluid.ipu_shard(ipu_stage=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1 - - with paddle.fluid.ipu_shard(ipu_stage=2): - g = f - 1 # scale, ipu_stage : 2 - - h = g + 1 # scale, ipu_stage : no - - ipu_index_list = [] - main_prog = paddle.static.default_main_program() - for op in main_prog.global_block().ops: - if op.desc.has_attr("ipu_stage"): - ipu_index_list.append(op.desc.attr("ipu_stage")) - - return ipu_index_list - - def test_ipu_shard(self): - ipu_index_list = self._test() - expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] - - self.assertTrue( - np.allclose( - ipu_index_list, expected_ipu_index_list, atol=0)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py deleted file mode 100644 index 48ab046deb370..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_place.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
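The deleted placement test below drove the IPU device APIs directly; condensed into a standalone sketch (assumes an IPU build of Paddle, with names taken from the test itself):

```python
import paddle
import paddle.fluid as fluid

# Guarded the same way as the test's skipIf decorator.
if paddle.is_compiled_with_ipu():
    num_devices = fluid.core.get_ipu_device_count()  # visible IPUs
    place = paddle.IPUPlace()                        # explicit IPU placement
    paddle.set_device('ipu')                         # or select by device string
    print(num_devices, paddle.get_device())          # e.g. 'ipus:{0-N}'
```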
- -from __future__ import print_function - -import numpy as np -import unittest -import sys -sys.path.append("..") -import paddle -import paddle.fluid as fluid - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuPlace(unittest.TestCase): - def test_ipu_place(self): - num_devices = fluid.core.get_ipu_device_count() - self.assertGreater(num_devices, 0) - - for i in range(num_devices): - place = paddle.IPUPlace() - p = fluid.core.Place() - p.set_place(place) - self.assertTrue(p.is_ipu_place()) - - def test_ipu_set_device(self): - num_devices = fluid.core.get_ipu_device_count() - self.assertGreater(num_devices, 0) - - for i in range(num_devices): - paddle.set_device('ipu') - device = paddle.get_device() - self.assertTrue(device == "ipus:{{0-{}}}".format(num_devices - 1)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py deleted file mode 100644 index 368556d8b2f2d..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
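The deleted test below still annotates ops through the legacy `paddle.fluid.ipu_shard` context manager; its replacement later in this patch switches to `paddle.static.ipu_shard_guard`. The mapping, as a sketch (assumes an IPU build):

```python
import paddle
paddle.enable_static()

a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')

# legacy, removed below:  with paddle.fluid.ipu_shard(ipu_index=1): ...
# replacement, added in test_ipu_shard_api_ipu.py:
with paddle.static.ipu_shard_guard(index=1):
    b = a + 1  # this scale op is recorded with ipu_index = 1
```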
- -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid - -paddle.enable_static() - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestIpuShard(unittest.TestCase): - def _test(self): - # build graph - a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') - b = a + 2 # scale : scale * x + bias, ipu_index : no - - with paddle.fluid.ipu_shard(ipu_index=1): - c = b + 1 # scale, ipu_index : 1 - with paddle.fluid.ipu_shard(ipu_index=2): - d = c * 2 # scale, ipu_index : 2 - with paddle.fluid.ipu_shard(ipu_index=3): - e = d + 3 # scale, ipu_index : 3 - with paddle.fluid.ipu_shard(ipu_index=1): - e = e + 3 # scale, ipu_index : 1 - with paddle.fluid.ipu_shard(ipu_index=2): - e = e + 3 # scale, ipu_index : 2 - - with paddle.fluid.ipu_shard(ipu_index=1): - f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1 - - with paddle.fluid.ipu_shard(ipu_index=2): - g = f - 1 # scale, ipu_index : 2 - - h = g + 1 # scale, ipu_index : no - - ipu_index_list = [] - main_prog = paddle.static.default_main_program() - for op in main_prog.global_block().ops: - if op.desc.has_attr("ipu_index"): - ipu_index_list.append(op.desc.attr("ipu_index")) - - return ipu_index_list - - def test_ipu_shard(self): - ipu_index_list = self._test() - expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] - self.assertTrue( - np.allclose( - ipu_index_list, expected_ipu_index_list, atol=0)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py new file mode 100644 index 0000000000000..026b19eccf187 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_shard_api_ipu.py @@ -0,0 +1,112 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
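In the new API exercised below, `index` pins ops to a physical IPU (manual sharding) while `stage` assigns them to a pipeline stage. A hedged sketch of combining the two and reading the placement back from op attributes (the combined-argument form is an assumption; the test below only sets them separately):

```python
import paddle
paddle.enable_static()

x = paddle.static.data(name='x', shape=[None, 1], dtype='int32')
with paddle.static.ipu_shard_guard(index=0, stage=1):  # assumed combined form
    y = x + 1

for op in paddle.static.default_main_program().global_block().ops:
    if op.desc.has_attr("ipu_index"):
        print(op.type, op.desc.attr("ipu_index"), op.desc.attr("ipu_stage"))
```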
+ +from __future__ import print_function + +import numpy as np +import unittest +import paddle + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestIpuShard(unittest.TestCase): + def _test(self): + # build graph + a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') + b = a + 2 # scale : scale * x + bias, ipu_index : no + + with paddle.static.ipu_shard_guard(index=1): + c = b + 1 # scale, ipu_index : 1 + with paddle.static.ipu_shard_guard(index=2): + d = c * 2 # scale, ipu_index : 2 + with paddle.static.ipu_shard_guard(index=3): + e = d + 3 # scale, ipu_index : 3 + with paddle.static.ipu_shard_guard(index=1): + e = e + 3 # scale, ipu_index : 1 + with paddle.static.ipu_shard_guard(index=2): + e = e + 3 # scale, ipu_index : 2 + + with paddle.static.ipu_shard_guard(index=1): + f = paddle.tensor.pow(e, 2.0) # pow, ipu_index : 1 + + with paddle.static.ipu_shard_guard(index=2): + g = f - 1 # scale, ipu_index : 2 + + h = g + 1 # scale, ipu_index : no + + ipu_index_list = [] + main_prog = paddle.static.default_main_program() + for op in main_prog.global_block().ops: + if op.desc.has_attr("ipu_index"): + ipu_index_list.append(op.desc.attr("ipu_index")) + + return ipu_index_list + + def test_ipu_shard(self): + ipu_index_list = self._test() + expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] + self.assertTrue( + np.allclose( + ipu_index_list, expected_ipu_index_list, atol=0)) + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestIpuPipeline(unittest.TestCase): + def _test(self): + # build graph + a = paddle.static.data(name='data', shape=[None, 1], dtype='int32') + b = a + 2 # scale : scale * x + bias, ipu_stage : no + + with paddle.static.ipu_shard_guard(stage=1): + c = b + 1 # scale, ipu_stage : 1 + with paddle.static.ipu_shard_guard(stage=2): + d = c * 2 # scale, ipu_stage : 2 + with paddle.static.ipu_shard_guard(stage=3): + e = d + 3 # scale, ipu_stage : 3 + with paddle.static.ipu_shard_guard(stage=1): + e = e + 3 # scale, ipu_stage : 1 + with paddle.static.ipu_shard_guard(stage=2): + e = e + 3 # scale, ipu_stage : 2 + + with paddle.static.ipu_shard_guard(stage=1): + f = paddle.tensor.pow(e, 2.0) # pow, ipu_stage : 1 + + with paddle.static.ipu_shard_guard(stage=2): + g = f - 1 # scale, ipu_stage : 2 + + h = g + 1 # scale, ipu_stage : no + + ipu_index_list = [] + main_prog = paddle.static.default_main_program() + for op in main_prog.global_block().ops: + if op.desc.has_attr("ipu_stage"): + ipu_index_list.append(op.desc.attr("ipu_stage")) + + return ipu_index_list + + def test_ipu_shard(self): + ipu_index_list = self._test() + expected_ipu_index_list = [1, 2, 3, 1, 2, 1, 2] + + self.assertTrue( + np.allclose( + ipu_index_list, expected_ipu_index_list, atol=0)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py deleted file mode 100644 index afeec9ee1b6fa..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler - -paddle.enable_static() -SEED = 2021 - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestConvNet(unittest.TestCase): - def test_training(self): - ipu_strategy = paddle.static.IpuStrategy() - - assert ipu_strategy.num_ipus == 1, "Default num_ipus must be 1" - assert ipu_strategy.is_training == True, "Default is_training is True" - assert ipu_strategy.enable_pipelining == False, \ - "Default enable_pipelining is False" - assert ipu_strategy.enable_manual_shard == False, \ - "Default enable_manual_shard is False" - - ipu_strategy.SetGraphConfig( - num_ipus=2, is_training=False, enable_manual_shard=True) - ipu_strategy.SetPipeliningConfig(enable_pipelining=True) - assert ipu_strategy.num_ipus == 2, "Set num_ipus Failed" - - assert ipu_strategy.is_training == False, "Set is_training Failed" - - assert ipu_strategy.enable_pipelining == True, \ - "Set enable_pipelining Failed" - - assert ipu_strategy.enable_manual_shard == True, \ - "Set enable_manual_shard Failed" - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py new file mode 100644 index 0000000000000..f120f5594914e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
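The rewritten strategy test below relies on the renamed snake_case configuration API; the essentials in one sketch (assumes an IPU build; option names taken from the tests in this patch):

```python
import paddle.static

ipu_strategy = paddle.static.IpuStrategy()
ipu_strategy.set_graph_config(is_training=False)     # was SetGraphConfig(...)
ipu_strategy.set_precision_config(enable_fp16=True)  # popart fp16 path
ipu_strategy.set_options({'log_dir': 'paddle_log'})  # generic key/value setter
assert ipu_strategy.get_option('log_dir') == 'paddle_log'
```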
+ +import unittest + +import paddle +import paddle.static + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestIpuStrategy(unittest.TestCase): + def test_set_options(self): + ipu_strategy = paddle.static.IpuStrategy() + all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() + for option_name in all_option_names: + option = ipu_strategy._ipu_strategy.get_option(option_name) + option_type = option['type'] + option_value = option['value'] + if option_type in ['double']: + set_value = option_value + 0.5 + elif option_type == 'uint64': + set_value = option_value + 1 + elif option_type == 'bool': + set_value = not option_value + else: + continue + ipu_strategy.set_options({option_name: set_value}) + new_value = ipu_strategy.get_option(option_name) + assert new_value == set_value, f"set {option_name} to {set_value} failed" + + def test_set_string_options(self): + ipu_strategy = paddle.static.IpuStrategy() + options = { + 'cache_path': 'paddle_cache', + 'log_dir': 'paddle_log', + 'partials_type_matmuls': 'half', + 'partials_type_matmuls': 'float', + } + ipu_strategy.set_options(options) + for k, v in options.items(): + assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + + def test_set_other_options(self): + ipu_strategy = paddle.static.IpuStrategy() + options = {} + options['dot_checks'] = ['0', '1', '2', '3'] + options['engine_options'] = { + 'debug.allowOutOfMemory': 'true', + 'autoReport.directory': 'path', + 'autoReport.all': 'true' + } + for k, v in options.items(): + ipu_strategy.set_options({k: v}) + assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py index 196f94b68f94a..a52946bba1567 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_layernorm_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,44 +26,52 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + 
def set_op_attrs(self): self.attrs = { "scale": True, "shift": True, "begin_norm_axis": 1, "epsilon": 1e-05, } + self.optimizer = None - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') if self.is_training: ch = self.feed_shape[0][1] @@ -80,33 +82,38 @@ def _test_base(self, run_ipu=True): out = paddle.fluid.layers.nn.layer_norm( conv1, param_attr=scale, bias_attr=bias, **self.attrs) else: - # scale = True - # bias = True scale = self.attrs['scale'] bias = self.attrs['shift'] out = paddle.fluid.layers.nn.layer_norm( x, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) - if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) - fetch_list = [loss.name] - else: - fetch_list = [out.name] + fetch_list = [loss.name] - if run_ipu: + if self.is_training: + optimizer = None + if self.optimizer == 'sgd': + optimizer = paddle.optimizer.SGD(learning_rate=1e-2) + elif self.optimizer == 'adam': + optimizer = paddle.optimizer.Adam(learning_rate=1e-2) + elif self.optimizer == 'lamb': + optimizer = paddle.optimizer.Lamb( + learning_rate=1e-2, lamb_weight_decay=0.0) + if optimizer is not None: + optimizer.minimize(loss) + + if exec_mode: place = paddle.IPUPlace() else: place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -116,12 +123,14 @@ def _test_base(self, run_ipu=True): result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=self.feed_fp32, fetch_list=fetch_list) result.append(loss_res[0]) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, + feed=self.feed_fp32, + fetch_list=fetch_list) return result[0] def test_base(self): @@ -137,7 +146,7 @@ def test_base(self): @unittest.skip('raise error') class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": False, "shift": True, @@ -148,7 +157,7 @@ def set_attrs(self): @unittest.skip('raise error') class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": True, "shift": False, @@ -158,18 +167,28 @@ def set_attrs(self): class TestCase3(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": True, "shift": True, "begin_norm_axis": 2, "epsilon": 1e-05, } + self.optimizer = None class TestTrainCase1(TestBase): + def set_op_attrs(self): + self.attrs = { + "scale": True, + "shift": True, + "begin_norm_axis": 1, + "epsilon": 1e-05 + } + self.optimizer = 'sgd' + def set_atol(self): - 
self.atol = 1e-3 + self.atol = 1e-6 def set_training(self): self.is_training = True @@ -178,15 +197,34 @@ def set_training(self): class TestTrainCase2(TestBase): def set_atol(self): - self.atol = 1e-3 + self.atol = 5e-4 - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": True, "shift": True, "begin_norm_axis": 2, - "epsilon": 1e-05, + "epsilon": 1e-05 + } + self.optimizer = 'adam' + + def set_training(self): + self.is_training = True + self.epoch = 10 + + +class TestTrainCase3(TestBase): + def set_atol(self): + self.atol = 5e-3 + + def set_op_attrs(self): + self.attrs = { + "scale": True, + "shift": True, + "begin_norm_axis": 2, + "epsilon": 1e-05 } + self.optimizer = 'lamb' def set_training(self): self.is_training = True diff --git a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py index dc3cab6ac5e11..fad7516e442a7 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_log_softmax_op_ipu.py @@ -16,15 +16,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer -import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) import paddle.nn.functional as F - -paddle.enable_static() +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -33,72 +27,81 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32') - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + self.feed_list = list(self.feed_fp32.keys()) def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = F.log_softmax(x, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != 
ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py new file mode 100644 index 0000000000000..3f8472890d03e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_logical_not_op_ipu.py @@ -0,0 +1,97 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
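The new logical_not test below, like the other migrated tests in this series, funnels every case through one ExecutionMode driver. The shared shape of that driver, excerpted for reference (it runs inside an IPUOpTest subclass, so `self` and `ExecutionMode` come from op_test_ipu):

```python
# Common driver of the migrated IPU tests:
output_dict = {}
for mode in ExecutionMode:  # CPU_FP32, IPU_FP32, IPU_POPART_FP16
    if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
        break  # fp16 modes only run when a test opts in
    output_dict[mode] = self._test_base(mode).flatten()
self.check(output_dict)  # compares every mode against the CPU reference
```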
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 20, 30528]) + self.feed = {"in_0": data.astype('bool')} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [x.dtype for x in self.feed.values()] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype="bool") + + out = paddle.fluid.layers.logical_not(x) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).astype(np.int32) + + self.check(output_dict, check_shape=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py index 31b0c99603c3f..4a877ddce4e3c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,16 +26,25 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_attrs() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) + self.feed_cpu = {"x": data.astype(np.int64)} + self.feed_ipu = {"x": data.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - 
self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_cpu.values()] + self.feed_list = list(self.feed_cpu.keys()) + self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "size": [128, 16], "is_sparse": False, @@ -50,33 +53,20 @@ def set_attrs(self): "dtype": 'float32' } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - - if run_ipu: - self.feed = { - "x": np.array( - [[[1], [3]], [[2], [4]], [[4], [127]]]).astype(np.int32) - } - else: - self.feed = { - "x": np.array( - [[[1], [3]], [[2], [4]], [[4], [127]]]).astype(np.int64) - } + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - self.set_feed_attr() - - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='int64') + out = paddle.fluid.layers.embedding(x, **self.attrs) if self.is_training: @@ -87,47 +77,61 @@ def _test_base(self, run_ipu=True): else: fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_cpu + if exec_mode > ExecutionMode.CPU_FP32: + feed = self.feed_ipu + if self.is_training: result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=feed, fetch_list=fetch_list) result.append(loss_res[0]) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or + self.is_training): + break - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestTrainCase1(TestBase): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + def set_training(self): self.is_training = True self.epoch = 10 diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py new file mode 100644 index 0000000000000..da8048fb3205e --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/ipu/test_lookuptable_v2_op_ipu.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.array([[[1], [3]], [[2], [4]], [[4], [127]]]) + self.feed_cpu = {"x": x.astype(np.int64)} + self.feed_ipu = {"x": x.astype(np.int32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_cpu.values()] + self.feed_list = list(self.feed_cpu.keys()) + self.feed_dtype = [x.dtype for x in self.feed_cpu.values()] + + def set_op_attrs(self): + self.attrs = { + "num_embeddings": 128, + "embedding_dim": 16, + "sparse": False, + "padding_idx": -1, + "weight_attr": None + } + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int64') + + embedding = paddle.nn.Embedding(**self.attrs) + out = embedding(x) + + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + fetch_list = [loss.name] + else: + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_cpu + if exec_mode > ExecutionMode.CPU_FP32: + feed = self.feed_ipu + + if self.is_training: + result = [] + for _ in range(self.epoch): + loss_res = exe.run(program, + feed=feed, + fetch_list=fetch_list) + result.append(loss_res[0]) + return np.array(result) + else: + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and (not self.fp16_enabled or + self.is_training): + break + output_dict[mode] = 
self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestTrainCase1(TestBase): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-6 + self.atol_fp16 = 1e-3 + self.rtol_fp16 = 1e-3 + + def set_training(self): + self.is_training = True + self.epoch = 10 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py similarity index 95% rename from python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py rename to python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py index 38b91785aeec8..58f018e2ae649 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduelr.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_lr_sheduler_ipu.py @@ -19,7 +19,7 @@ import sys import paddle import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static from paddle.optimizer.lr import LRScheduler paddle.enable_static() @@ -71,8 +71,8 @@ def _test(self, run_ipu=True): feed_list = [image.name] fetch_list = [loss.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py index c6702b92ab969..6929ded6ebf90 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,85 +26,93 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[2, 3]).astype('float32'), - "y": np.random.uniform(size=[3, 2]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[20, 30]) + y = np.random.uniform(size=[30, 20]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": False, "transpose_y": False, "alpha": 1.0, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = 
self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.matmul(x, y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, @@ -119,55 +121,64 @@ def set_attrs(self): class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, "alpha": 3.14, } + def set_atol(self): + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + class TestCase3(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[5, 4, 2, 3]).astype('float32'), - "y": np.random.uniform(size=[5, 4, 3, 2]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[5, 4, 3, 2]) + y = np.random.uniform(size=[5, 4, 2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase4(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[4, 2, 3]).astype('float32'), - "y": np.random.uniform(size=[4, 3, 2]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[4, 3, 2]) + y = np.random.uniform(size=[4, 2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase5(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[4, 2, 3]).astype('float32'), - "y": np.random.uniform(size=[3, 2]).astype('float32'), - 
} + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase6(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3]).astype('float32'), - "y": np.random.uniform(size=[3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} @unittest.skip("not supported") class TestCase6_2(TestCase6): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3]).astype('float32'), - "y": np.random.uniform(size=[3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, @@ -176,27 +187,36 @@ def set_attrs(self): class TestCase7(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 1]).astype('float32'), - "y": np.random.uniform(size=[1, 2]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[1, 12, 128, 64]) + y = np.random.uniform(size=[1, 12, 128, 64]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"transpose_x": False, "transpose_y": True, "alpha": 0.125} + + +class TestCase8(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3, 1]) + y = np.random.uniform(size=[1, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} @unittest.skip("not supported") -class TestCase7_2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3]).astype('float32'), - "y": np.random.uniform(size=[2]).astype('float32'), - } - # equal to - # self.feed = { - # "x": np.random.uniform(size=[3, 1]).astype('float32'), - # "y": np.random.uniform(size=[1, 2]).astype('float32'), - # } +class TestCase8_2(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3]) + y = np.random.uniform(size=[2]) - def set_attrs(self): + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_op_attrs(self): self.attrs = { "transpose_x": True, "transpose_y": True, @@ -205,12 +225,12 @@ def set_attrs(self): @unittest.skip("dim > 4 is not supported") -class TestCase8(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[6, 5, 4, 2, 3]).astype('float32'), - "y": np.random.uniform(size=[6, 5, 4, 3, 2]).astype('float32'), - } +class TestCase9(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[6, 5, 4, 2, 3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": x.astype(np.float16)} if __name__ == "__main__": From 061044a0cc199f03645a9cbc836f46da3930329d Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Tue, 8 Mar 2022 13:59:03 +0800 Subject: [PATCH 23/50] [IPU] update ipu unittests p4 
(#40073) * update ipu UTs part4 * rename uts * sync api changes * update uts for new api --- .../unittests/ipu/test_set_batch_size_ipu.py | 96 +++++----- .../tests/unittests/ipu/test_sgd_optimizer.py | 88 --------- .../tests/unittests/ipu/test_slice_op_ipu.py | 122 +++++++------ .../unittests/ipu/test_softmax_op_ipu.py | 87 ++++----- .../tests/unittests/ipu/test_split_op_ipu.py | 113 ++++++++++++ .../unittests/ipu/test_squeeze_op_ipu.py | 91 +++++----- .../tests/unittests/ipu/test_stack_op_ipu.py | 102 ++++++----- .../tests/unittests/ipu/test_sum_op_ipu.py | 143 ++++++++------- .../tests/unittests/ipu/test_topk_op_ipu.py | 171 +++++++++--------- .../unittests/ipu/test_transpose_op_ipu.py | 98 +++++----- .../unittests/ipu/test_unsqueeze_op_ipu.py | 86 ++++----- ...inplace.py => test_varname_inplace_ipu.py} | 37 ++-- .../unittests/ipu/test_weight_sharing_ipu.py | 126 +++++++++++++ 13 files changed, 782 insertions(+), 578 deletions(-) delete mode 100644 python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_varname_inplace.py => test_varname_inplace_ipu.py} (79%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py index 93945b98ef0a2..9a18922f35331 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_set_batch_size_ipu.py @@ -16,13 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,36 +26,46 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([-1, 3, 128, 128]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=[2, 3, 128, 128]).astype(np.float32) - - self.feed_list = list(self.feed.keys()) - - def set_attrs(self): + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 3e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 128, 128]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = 
paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + conv1 = paddle.static.nn.conv2d( x, num_filters=3, filter_size=3, bias_attr=False) conv2 = paddle.static.nn.conv2d( @@ -70,36 +75,45 @@ def _test_base(self, run_ipu=True): conv4 = paddle.static.nn.conv2d( conv3, num_filters=3, filter_size=3, bias_attr=False) - fetch_list = [conv4.name] + fetch_list = [conv4.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( - batch_size=2, is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + # set batch size + ipu_strategy.micro_batch_size = 2 + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py b/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py deleted file mode 100644 index df0e2a040bd3e..0000000000000 --- a/python/paddle/fluid/tests/unittests/ipu/test_sgd_optimizer.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import unittest -import sys -import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler - -paddle.enable_static() -SEED = 2021 - - -@unittest.skipIf(not paddle.is_compiled_with_ipu(), - "core is not compiled with IPU") -class TestSGD(unittest.TestCase): - def _test_sgd(self, run_ipu=True): - scope = fluid.core.Scope() - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) - - with fluid.scope_guard(scope): - with paddle.static.program_guard(main_prog, startup_prog): - image = paddle.static.data( - name='image', shape=[1, 3, 10, 10], dtype='float32') - conv1 = paddle.static.nn.conv2d( - image, num_filters=3, filter_size=3, bias_attr=False) - loss = paddle.mean(conv1) - - sgd = paddle.optimizer.SGD(learning_rate=1e-1) - sgd.minimize(loss) - - if run_ipu: - place = paddle.IPUPlace() - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - if run_ipu: - feed_list = [image.name] - fetch_list = [loss.name] - ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=True) - program = compiler.IPUCompiledProgram( - main_prog, ipu_strategy=ipu_strategy).compile(feed_list, - fetch_list) - else: - program = main_prog - - result = [] - for epoch in range(100): - loss_res = exe.run(program, - feed={"image": np_image}, - fetch_list=[loss]) - result.append(loss_res) - - return np.array(result) - - def test_sgd(self): - # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) - ipu_loss = self._test_sgd(True).flatten() - cpu_loss = self._test_sgd(False).flatten() - - self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=1e-4)) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py index 3bdfeabce6592..8881f018de3b5 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_slice_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,78 +26,88 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[4, 5, 6]).astype('float32'), } + def set_data_feed(self): + data = np.random.uniform(size=[4, 5, 6]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype 
for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "axes": [0, 1, 2], "starts": [-3, 0, 2], "ends": [3, 2, 4], } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.slice(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "axes": [0, 1], "starts": [0, 0], @@ -113,38 +117,45 @@ def set_attrs(self): @unittest.skip('dynamic graph is not support on IPU') class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[4, 5, 6]).astype('float32'), - "starts": np.array([0, 0, 2]).astype('int32'), - "ends": np.array([3, 2, 4]).astype('int32'), + def set_data_feed(self): + x = np.random.uniform(size=[4, 5, 6]) + s = np.array([0, 0, 2]) + e = np.array([3, 2, 4]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "starts": s.astype(np.int32), + "ends": e.astype(np.int32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "starts": s.astype(np.int32), + "ends": e.astype(np.int32) } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [0, 1, 2]} def _test_base(self, run_ipu=True): scope = fluid.core.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED with fluid.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = 
paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') starts = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='int32') ends = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], - dtype=self.feed_dtype[2]) + dtype='int32') out = paddle.fluid.layers.slice( x, starts=starts, ends=ends, **self.attrs) @@ -160,8 +171,8 @@ def _test_base(self, run_ipu=True): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -170,6 +181,9 @@ def _test_base(self, run_ipu=True): result = exe.run(program, feed=self.feed, fetch_list=fetch_list) return result[0] + def test_base(self): + pass + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index a4a4b83baf35e..25201959cecbc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -13,16 +13,11 @@ # limitations under the License. import unittest + import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,76 +26,84 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 2, 20]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": -1} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.softmax(x, **self.attrs) - 
fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 2} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py new file mode 100644 index 0000000000000..59af3a3d6ac17 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_split_op_ipu.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
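+
+# This new test follows the reworked IPUOpTest pattern: build one static
+# program around paddle.split, run it under each ExecutionMode (CPU FP32,
+# IPU FP32 and, when enabled, IPU POPART FP16), then compare the flattened
+# outputs via self.check(). Note that test_base below deliberately breaks
+# out of the loop before the IPU_POPART_FP16 mode for this op.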
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 10, 10]) + + self.feed_fp32 = {'x': data1.astype(np.float32)} + self.feed_fp16 = {'x': data1.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {"num_or_sections": [1, 1, 1], "axis": 1} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.split(x, **self.attrs) + + fetch_list = [fetch.name for fetch in out] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if (mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled + ) or mode == ExecutionMode.IPU_POPART_FP16: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"num_or_sections": [2, 8], "axis": 2} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py index ccd2796590838..bdc8fb32c8472 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_squeeze_op_ipu.py @@ -13,16 +13,11 @@ # limitations under the License. 
import unittest + import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,81 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 1, 5]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 1, 5]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [0]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.squeeze(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, - iipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - 
self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": []} class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [-2]} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py index 3d5de11b5e213..c807ab9aab65e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_stack_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,86 +26,102 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 2]).astype('float32'), - "y": np.random.uniform(size=[1, 2]).astype('float32'), - "z": np.random.uniform(size=[1, 2]).astype('float32'), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[1, 2]) + y = np.random.uniform(size=[1, 2]) + z = np.random.uniform(size=[1, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + "z": z.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + "z": z.astype(np.float16) } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') z = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], - dtype=self.feed_dtype[2]) + dtype='float32') + out = paddle.fluid.layers.stack([x, y, z], **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != 
ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axis": -2} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py index 003350cd7a01e..12351cb63d6c8 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_sum_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,131 +26,154 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[1, 3, 2, 2]) + y = np.random.uniform(size=[1, 3, 2, 2]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with 
paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.sum([x, y], **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) @unittest.skip('') class TestCase1(TestBase): def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "y": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "z": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), + x = np.random.uniform(size=[1, 3, 2, 2]) + y = np.random.uniform(size=[1, 3, 2, 2]) + z = np.random.uniform(size=[1, 3, 2, 2]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + "z": y.astype(np.float32) + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + "z": y.astype(np.float16) } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') z = paddle.static.data( name=self.feed_list[2], shape=self.feed_shape[2], - dtype=self.feed_dtype[2]) + dtype='float32') + out = paddle.fluid.layers.sum([x, y, z], **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = 
paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py index 9915a7a1fd89f..ef75aee78049b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_topk_op_ipu.py @@ -16,130 +16,125 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") class TestTopKOp(IPUOpTest): def setUp(self): - self.set_ops() self.set_atol() self.set_training() - self.k = 3 - self.use_K_as_const_variable = False - - self.set_feed() - self.set_attrs() - - def set_ops(self): - self.ops = [ - paddle.fluid.layers.topk, - paddle.topk # use top_k_v2 implementation - ] - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([3, 5]) - - self.feed = {} - self.feed_list = [] - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - self.feed_list.append("in_0") - if self.use_K_as_const_variable: - # self.feed["in_1"] = np.array([self.k]).astype("int32") - # self.feed_list.append("in_1") - pass - print("[TestTopKop] feed data:\n%s" % self.feed["in_0"]) - - def set_attrs(self): - self.attrs = { - # "axis": -1, - # "sorted": True - } - if not self.use_K_as_const_variable: - self.attrs["k"] = self.k - - def _test_base(self, run_ipu=True, op=None, data_feed=None): - assert (op is not None) - assert (data_feed is not None) - scope = fluid.core.Scope() + self.set_data_feed() + self.set_feed_attr() + self.set_test_op() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_test_op(self): + self.op = paddle.fluid.layers.topk + + def set_data_feed(self): + data = np.random.uniform(size=[3, 5]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.use_k_as_const_variable = False + self.attrs = {} + if not self.use_k_as_const_variable: + self.attrs["k"] = 3 + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED +
startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - if not self.use_K_as_const_variable: - topk_values, topk_indices = op(x, **self.attrs) + + if not self.use_k_as_const_variable: + topk_values, topk_indices = self.op(x, **self.attrs) else: # !important, popart cannot accept non const tensor - # K_t = paddle.static.data(name="in_1", shape=[1], dtype='int32') - K_t = fluid.layers.fill_constant( + K_t = paddle.fluid.layers.fill_constant( shape=[1], dtype='int32', value=self.k, name="in_2") - topk_values, topk_indices = op(x, K_t, **self.attrs) + topk_values, topk_indices = self.op(x, K_t, **self.attrs) + fetch_list = [topk_values.name, topk_indices.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - print("Running inference ...") - result = exe.run(program, feed=data_feed, fetch_list=fetch_list) - print("Complete running infrence.") + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result def test_base(self): - for op in self.ops: - res0_topk_values, res0_topk_indices = self._test_base( - True, op=op, data_feed=self.feed) - res1_topk_values, res1_topk_indices = self._test_base( - False, op=paddle.fluid.layers.topk, data_feed=self.feed) - - print("[TestTopKop] IPU res0 values:\n%s\n" % res0_topk_values) - print("[TestTopKop] CPU res1 values:\n%s\n" % res1_topk_values) - view_type = np.uint32 - print("[TestTopKop] IPU res0 indices:\n%s\n" % - res0_topk_indices.astype(view_type)) - print("[TestTopKop] CPU res1 indices:\n%s\n" % res1_topk_indices) - - self.assertTrue( - np.allclose( - res0_topk_values.flatten(), - res1_topk_values.flatten(), - atol=self.atol)) - - self.assertTrue( - np.allclose( - res0_topk_indices.astype(view_type).flatten(), - res1_topk_indices.flatten(), - atol=self.atol)) + value_dict = {} + index_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + value, index = self._test_base(mode) + value_dict[mode] = value + index_dict[mode] = index + + self.check(value_dict) + self.check(index_dict) + + +class TestCase2(TestTopKOp): + def set_test_op(self): + self.op = paddle.topk + + +@unittest.skip("Trying to get data as int64 but it is of type int32") +class TestCase3(TestTopKOp): + def set_op_attrs(self): + self.use_k_as_const_variable = True + self.attrs = {} + self.k = 2 + + +@unittest.skip("Trying to get data as int64 but it is of type int32") +class TestCase4(TestCase3): + def set_test_op(self): + self.op = paddle.topk if __name__ == "__main__": diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py index 77d2f41310149..1747bde20b6a6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_transpose_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,86 +26,94 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"perm": [0, 2, 3, 1]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.transpose(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + 
return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"perm": [0, 1, 2, 3]} class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 2, 3, 4, 5]).astype('float32'), - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 2, 3, 4, 5]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"perm": [4, 0, 2, 3, 1]} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py index 75ed5a07315c7..e068c2e3b5908 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_unsqueeze_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,79 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[1, 2, 3]).astype('float32')} + def set_data_feed(self): + data = np.random.uniform(size=[1, 2, 3]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": 0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.unsqueeze(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) 
exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": -1} class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"axes": [1, 2]} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py similarity index 79% rename from python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py rename to python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py index fabad936decb9..5cc62432dc635 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_varname_inplace_ipu.py @@ -16,15 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -from paddle.fluid.executor import global_scope -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -33,11 +26,11 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): + def set_data_feed(self): self.feed = { "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), } @@ -45,25 +38,22 @@ def set_feed(self): def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "shape": [30, 10], "inplace": True, } def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with 
paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -76,12 +66,13 @@ def _test_base(self, run_ipu=True): scale2 = paddle.fluid.layers.scale(scale1, scale=1.3, bias=0.5) scale3 = paddle.fluid.layers.scale(scale2, scale=2, bias=0.7) - fetch_list = [scale3.name] + fetch_list = [scale3.name] if run_ipu: place = paddle.IPUPlace() else: place = paddle.CPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) scale1_out = main_prog.global_block().ops[4].output("Out")[0] @@ -92,8 +83,8 @@ def _test_base(self, run_ipu=True): if run_ipu: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py new file mode 100644 index 0000000000000..ecf1c61f52e83 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_sharing_ipu.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
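+
+# This new test verifies weight sharing across pipeline stages: the
+# word_embedding parameter created on IPU index 0 (stage 0) is reused by the
+# matmul on IPU index 0 (stage 2), while the fc layer is sharded onto IPU
+# index 1 (stage 1). The IPU run is pipelined with batches_per_step=3, so the
+# int64 feed is tiled three times and only the first slice of the IPU result
+# is compared against the CPU run in test_base.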
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestWeightSharing(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + x = np.random.randint(0, 768, size=(128, 1)).astype(np.int32) + self.feed_cpu = {"x": x.astype(np.int64)} + self.feed_ipu = { + "x": np.tile(x.astype(np.int64)[np.newaxis, :], [3, 1, 1]) + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_cpu.values()] + self.feed_list = list(self.feed_cpu.keys()) + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='int64') + + with paddle.static.ipu_shard_guard(index=0, stage=0): + y = paddle.fluid.layers.embedding( + input=x, + size=[768, 768], + dtype='float32', + param_attr=paddle.fluid.ParamAttr( + name='word_embedding'), + is_sparse=False) + + with paddle.static.ipu_shard_guard(index=1, stage=1): + z = paddle.fluid.layers.fc( + input=y, + size=768, + param_attr=paddle.fluid.ParamAttr(name="fc")) + + with paddle.static.ipu_shard_guard(index=0, stage=2): + out = paddle.fluid.layers.matmul( + x=z, + y=main_prog.global_block().var('word_embedding'), + transpose_y=True) + + fetch_list = [out.name] + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config( + num_ipus=2, + is_training=self.is_training, + enable_manual_shard=True) + ipu_strategy.set_pipelining_config( + enable_pipelining=True, batches_per_step=3) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_ipu if run_ipu else self.feed_cpu + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + res0 = self._test_base(False) + res1 = self._test_base(True) + + self.assertTrue( + np.allclose( + res0.flatten(), res1[0].flatten(), atol=self.atol)) + + +if __name__ == "__main__": + unittest.main() From 47d1d5af242c49e36520d2fd04abcac2715fe6f4 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Tue, 8 Mar 2022 14:31:27 +0800 Subject: [PATCH 24/50] [PHI] Support string type attr in yaml (#40218) * support str attr in yaml * fix bug --- .../final_state_generator/eager_gen.py | 4 ++-- python/paddle/utils/code_gen/api.yaml | 4 ++-- python/paddle/utils/code_gen/api_base.py | 14 ++++++++------ python/paddle/utils/code_gen/sparse_api.yaml | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 
d1e208541537c..81d0c9b7bed59 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -25,10 +25,10 @@ yaml_types_mapping = { - 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ - 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', 'Tensor[Tensor[]]' : 'std::vector>', diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 699e42f23732a..8c68ca4d7e0e4 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -121,7 +121,7 @@ backward : matmul_grad - api : mean - args : (Tensor x, int64_t[] axis={}, bool keep_dim=false) + args : (Tensor x, int64[] axis={}, bool keep_dim=false) output : Tensor infer_meta : func : MeanInferMeta @@ -181,7 +181,7 @@ func : subtract - api : sum - args : (Tensor x, int64_t[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) + args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) output : Tensor infer_meta : func : SumInferMeta diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 601248a417639..68127fb522c3d 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -89,18 +89,20 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): attr_types_map = { 'ScalarArray': 'const ScalarArray&', 'Scalar': 'const Scalar&', + 'uint8': 'uint8_t', 'int': 'int', - 'int32_t': 'int32_t', - 'int64_t': 'int64_t', + 'int32': 'int32_t', + 'int64': 'int64_t', 'long': 'long', 'size_t': 'size_t', 'float': 'float', 'double': 'double', 'bool': 'bool', + 'str': 'const std::string&', 'Backend': 'Backend', 'DataLayout': 'DataLayout', 'DataType': 'DataType', - 'int64_t[]': 'const std::vector&', + 'int64[]': 'const std::vector&', 'int[]': 'const std::vector&', 'long[]': 'const std::vector&' } @@ -110,8 +112,8 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'ScalarArray': 'const paddle::optional&', 'Scalar': 'const paddle::optional&', 'int': 'paddle::optional', - 'int32_t': 'paddle::optional', - 'int64_t': 'paddle::optional', + 'int32': 'paddle::optional', + 'int64': 'paddle::optional', 'size_t': 'paddle::optional', 'float': 'paddle::optional', 'double': 'paddle::optional', @@ -119,7 +121,7 @@ def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): 'Backend': 'paddle::optional', 'DataLayout': 'paddle::optional', 'DataType': 'paddle::optional', - 'int64_t[]': 'paddle::optional>', + 'int64[]': 'paddle::optional>', 'int[]': 'paddle::optional>' } diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 135989121cca6..b531c2ed9ce51 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -11,7 +11,7 @@ invoke : to_dense_impl(x, backend) - sparse_api : to_sparse_coo - args : (Tensor x, Backend backend, int64_t sparse_dim) + args : (Tensor x, Backend backend, int64 
sparse_dim)
   output : Tensor(out@SparseCooTensor)
   invoke : to_sparse_coo_impl(x, backend, sparse_dim)

From f1fe2ad45d2b4cd013ce83194192b1fb7bc72957 Mon Sep 17 00:00:00 2001
From: xiongkun
Date: Tue, 8 Mar 2022 14:33:28 +0800
Subject: [PATCH 25/50] add support for concat and variadic tensor list
 (#40229)

---
 .../paddle/fluid/tests/unittests/op_test.py | 23 +++++++++++--------
 .../fluid/tests/unittests/test_concat_op.py |  1 +
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index 0c7f269a087b8..6455da924757b 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -722,13 +722,17 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig):

        def assumption_assert_and_transform(args, argvs):
            """
-           currently only support "X" is [Tensor], don't support multi-tensor in "X"
+           Transform the inputs by the following rules:
+               1. [Tensor] -> Tensor
+               2. [Tensor, Tensor, ...] -> list of Tensors
+
+           Only a list of Tensors is supported for "X"; other structures such as dict are not supported yet.
            """
            for inp in args:
-               assert isinstance(inp, list) and len(
-                   inp
-               ) == 1, "currently only support `X` is [Tensor], don't support multi-tensor in `X`"
-           args = [inp[0] for inp in args]
+               assert isinstance(
+                   inp, list
+               ), "currently `X` must be a list of Tensors; other structures (e.g. dict) are not supported."
+           args = [inp[0] if len(inp) == 1 else inp for inp in args]
            return args, argvs

        def cal_python_api(python_api, args, argvs, kernel_sig):
@@ -1239,15 +1243,16 @@ def check_output_with_place(self,
                dygraph_outs = self._calc_dygraph_output(
                    place, no_check_set=no_check_set)

+           if check_eager:
+               with _test_eager_guard():
+                   eager_dygraph_outs = self._calc_dygraph_output(
+                       place, no_check_set=no_check_set)
+
            # we only check end2end api when check_eager=True
            if hasattr(self, "python_api"):
                api_outs = self._calc_python_api_output(place)
                self._check_api_outs_by_dygraph_outs(api_outs, dygraph_outs,
                                                     place)
-           if check_eager:
-               with _test_eager_guard():
-                   eager_dygraph_outs = self._calc_dygraph_output(
-                       place, no_check_set=no_check_set)

            outs, fetch_list = self._calc_output(place, no_check_set=no_check_set)
            for out_name, out_dup in Operator.get_op_outputs(self.op_type):

diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py
index 10b7e13dcc334..4feca1b92505b 100644
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
@@ -25,6 +25,7 @@ class TestConcatOp(OpTest):
    def setUp(self):
        self.op_type = "concat"
+       self.python_api = paddle.concat
        self.dtype = self.get_dtype()
        self.init_test_data()
        self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}

From 975f99ab012310e97cbdee44bb25a05ad7bad012 Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Tue, 8 Mar 2022 14:54:30 +0800
Subject: [PATCH 26/50] [Phi]Move Relu/Cos/Sin/Tan/Acos/Asin/Atan/Sinh/Cosh/Asinh/Acosh/Atanh
 kernels in Activation to Phi (#40175)

* move activation op
* adjust code format
* fix compile bugs
* fix ci bugs
* code format adjust
* code format adjust2
* activate ci status
* modify according to comment

---
 cmake/operators.cmake                         |   2 +-
 .../ir/mkldnn/mkldnn_inplace_pass_tester.cc   |   2 +-
 .../paddle2cinn/build_cinn_pass_test.cc       |   4 +-
 .../paddle2cinn/cinn_compiler_test.cc         |   2 +-
 .../fluid/imperative/tests/test_prepare_op.cc |   2 +-
 .../tensorrt/convert/test_activation_op.cc    |   2 +-
 .../fluid/operators/activation_cudnn_op.cu.cc |  19 +-
 paddle/fluid/operators/activation_op.cc       |  43 +-
 paddle/fluid/operators/activation_op.h        | 590 +++----
 paddle/fluid/operators/activation_op.kps      | 454 ++--
 .../operators/mkldnn/test_mkldnn_caching.cc   |   2 +-
 .../mkldnn/test_mkldnn_op_inplace.cc          |   2 +-
 .../operators/mkldnn/test_mkldnn_op_nhwc.cc   |   2 +-
 .../operators/mlu/activation_op_mlu_test.cc   |   2 +-
 .../test_common_infer_shape_functions.cc      |   2 +-
 paddle/phi/kernels/activation_grad_kernel.h   |  55 ++
 paddle/phi/kernels/activation_kernel.h        |  40 +
 .../phi/kernels/cpu/activation_grad_kernel.cc |  91 ++
 paddle/phi/kernels/cpu/activation_kernel.cc   |  55 ++
 paddle/phi/kernels/funcs/activation_functor.h | 830 ++++++++++++++++++
 .../phi/kernels/gpu/activation_grad_kernel.cu | 221 +++++
 paddle/phi/kernels/gpu/activation_kernel.cu   | 143 +++
 .../phi/kernels/impl/activation_grad_impl.h   | 133 +++
 paddle/phi/kernels/impl/activation_impl.h     |  50 ++
 paddle/phi/ops/compat/activation_sig.cc       |  67 ++
 25 files changed, 1908 insertions(+), 907 deletions(-)
 create mode 100644 paddle/phi/kernels/activation_grad_kernel.h
 create mode 100644 paddle/phi/kernels/activation_kernel.h
 create mode 100644 paddle/phi/kernels/cpu/activation_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/activation_kernel.cc
 create mode 100644 paddle/phi/kernels/funcs/activation_functor.h
 create mode 100644 paddle/phi/kernels/gpu/activation_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/activation_kernel.cu
 create mode 100644 paddle/phi/kernels/impl/activation_grad_impl.h
 create mode 100644 paddle/phi/kernels/impl/activation_impl.h
 create mode 100644 paddle/phi/ops/compat/activation_sig.cc

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 9e8c81c2985b7..1291e60cfe4ce 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -478,7 +478,7 @@ function(op_library TARGET)
   if (${pybind_flag} EQUAL 0)
     # NOTE(*): activation use macro to register the kernels, set use_op manually.
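The hunk continues below by swapping USE_OP(relu) for USE_OP_ITSELF(relu), the same substitution this patch applies in every test file it touches. A minimal C++ sketch of the distinction, assuming fluid's op_registry macros behave as the rest of this series implies (USE_OP references the operator plus its fluid OpKernel registration, USE_OP_ITSELF only the operator symbol); the file and comments are illustrative, not part of the patch:

    // hypothetical consumer translation unit
    #include "paddle/fluid/framework/op_registry.h"

    // Before the migration, USE_OP(relu) pulled in both the relu operator
    // definition and a reference to its fluid kernel registration. After the
    // kernel moves to Phi, no fluid kernel registration exists to reference,
    // so callers keep only the operator itself:
    USE_OP_ITSELF(relu);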
if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852dd..796aa4039c9e8 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -27,7 +27,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf394f..47dffd47b7cbb 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9b9..cdccc4c554690 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99ad3..17cbe06748234 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -226,7 +226,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index f2dc5ba1c7c2c..7f7313fbcb596 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); USE_OP(tanh); USE_OP(relu6); diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ada7..b4a97e24cf292 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 
+148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 73d65b7c6e7e0..66f1bcc8b6869 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class ActivationOpTripleGrad : 
public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 +1471,18 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. Register Sigmoid Operator @@ -1584,16 +1603,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7bb6..4b79397b6cdf2 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
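The enum removed above now lives in phi (the patch aliases it back with a using-declaration), and because the phi version is a scoped enum, every FwdDeps() comparison in the hunks that follow gains explicit int casts. A minimal sketch of the pattern, with the enum shape assumed from the removed code:

    // Assumed phi-side definition; the values match the enum deleted above.
    enum class ActBwdOpFwdDeps {
      kNoDeps = 0x00,  // backward needs no forward input/output
      kDepX = 0x01,    // backward needs only the forward input X
      kDepOut = 0x02,  // backward needs only the forward output Out
    };

    // A scoped enum no longer converts to int implicitly, so flag tests are
    // written with casts, e.g. checking whether a functor depends on Out:
    inline bool NeedsForwardOut(ActBwdOpFwdDeps deps) {
      return (static_cast<int>(deps) &
              static_cast<int>(ActBwdOpFwdDeps::kDepOut)) != 0;
    }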
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,24 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +312,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +372,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +399,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +437,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // exp(x) = e^x @@ -434,7 +457,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,38 +479,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - 
out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; +template +using ReluGradFunctor = phi::funcs::ReluGradFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +template +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -504,7 +514,9 @@ struct TanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -534,7 +546,9 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* Out @@ -589,7 +603,9 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { static_cast(2) * out * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // tanhshrink(x) = x - tanh(x) @@ -610,7 +626,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +662,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +698,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +718,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +740,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +762,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; 
} + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +776,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +802,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +824,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +845,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +866,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +886,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,7 +906,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1285,7 +942,7 @@ struct BReluGradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +976,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +1023,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +1068,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +1108,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +1131,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,7 +1163,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1539,7 +1200,7 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1573,7 +1234,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1592,7 +1253,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1333,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 
+1362,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,7 +1427,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1797,7 +1458,7 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x > th).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1493,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,7 +1528,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; /* @@ -1902,7 +1565,7 @@ inline void ExtractActivationDoubleGradTensor( "Cannot get the tensor from the Variable Output, variable name = %s", ctx.OutputName("DDX"))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, platform::errors::NotFound( @@ -1925,7 +1588,8 @@ inline void ExtractActivationDoubleGradTensor( VLOG(10) << "Inplace activation of Op: " << ctx.Type(); *X = *ddX; } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { auto out_var = ctx.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, @@ -2000,28 +1664,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2050,7 +1693,7 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 +1731,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { 
.template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1770,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1799,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1830,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1861,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2487,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,20 +2496,9 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 3b7ce9eaf2bea..208abd0949aa8 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,28 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? 
x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - template struct CudaLeakyReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -69,7 +47,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { return x > zero ? dout : static_cast(alpha) * dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -93,7 +71,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +102,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +139,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +176,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,191 +219,9 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - 
return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -469,88 +244,11 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { return dout * (one - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } }; -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); - } -}; - -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = 
static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); - } -}; - -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -566,7 +264,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +287,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +310,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +333,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +351,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +374,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +399,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +425,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +449,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,7 +473,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return 
dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -804,7 +512,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { return (x > t_min_cast && x < t_max_cast) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +557,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +603,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +649,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +672,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +706,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +734,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +768,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +809,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +855,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,7 +904,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1222,7 +936,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { return x > static_cast(threshold) ? 
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +988,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +1034,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +1063,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1145,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1193,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if (static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1602,50 +1319,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1838,21 +1511,10 @@ REGISTER_OP_CUDA_KERNEL( __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ CudaLogSigmoidGradFunctor); \ - __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor); \ 
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor); \ - __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); \ - __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); \ - __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor); \ - __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); \ - __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor); \ - __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor); \ - __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor); \ - __macro(asinh, Asinh, CudaAsinhFunctor, CudaAsinhGradFunctor); \ - __macro(acosh, Acosh, CudaAcoshFunctor, CudaAcoshGradFunctor); \ - __macro(atanh, Atanh, CudaAtanhFunctor, CudaAtanhGradFunctor); \ __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \ __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ CudaReciprocalGradFunctor); \ diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 05cd264cf3ec9..23428dd403e9b 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c792..e9dadd5ec937c 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -27,7 +27,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 3791fed23a84f..916f02179b364 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -27,7 +27,7 @@ USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 884521301750c..6e3bd5e43c9c1 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace fw = paddle::framework; namespace plat = paddle::platform; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58af6..1de1b590a1311 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. 
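// The test edits above hinge on a small distinction: USE_OP references both
// the operator definition and its fluid kernel registration, while
// USE_OP_ITSELF references only the operator definition. Once a kernel moves
// to phi there is no fluid registration left to reference, so a test that
// exercised relu now declares (a minimal sketch; both macros appear verbatim
// in the files touched above):
USE_OP_ITSELF(relu);                 // operator definition only
USE_OP_DEVICE_KERNEL(relu, MKLDNN);  // MKLDNN kernel is still registered in fluid
// The phi relu kernel is then resolved at runtime through the phi kernel
// registry rather than through a linked fluid kernel symbol.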
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h new file mode 100644 index 0000000000000..f34e5710ab729 --- /dev/null +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { + +#define DECLARE_ACTIVATION_GRAD_KERNEL_DepX(name) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx); + +#define DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(name) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx); + +template +void ReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + DenseTensor* ddout); + +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cos); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Tan); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acos); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sin); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asin); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atan); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Sinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Cosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Asinh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Acosh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepX(Atanh); +DECLARE_ACTIVATION_GRAD_KERNEL_DepOut(Relu); + +} // namespace phi diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h new file mode 100644 index 0000000000000..bdf8f4363598f --- /dev/null +++ b/paddle/phi/kernels/activation_kernel.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
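// For reference, one DepX declaration from activation_grad_kernel.h above,
// written out by hand. The macro listing elides its template parameter
// lists, so this expansion restores them as a best-effort sketch:
template <typename T, typename Context>
void CosGradKernel(const Context& dev_ctx,
                   const DenseTensor& x,     // forward input (kDepX)
                   const DenseTensor& dout,  // gradient of the output
                   DenseTensor* dx);         // gradient of the input
// The DepOut variant differs only in taking the forward output instead:
template <typename T, typename Context>
void ReluGradKernel(const Context& dev_ctx,
                    const DenseTensor& out,
                    const DenseTensor& dout,
                    DenseTensor* dx);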
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/infermeta/unary.h" + +namespace phi { + +#define DECLARE_ACTIVATION_KERNEL(name) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out); + +DECLARE_ACTIVATION_KERNEL(Cos) +DECLARE_ACTIVATION_KERNEL(Tan) +DECLARE_ACTIVATION_KERNEL(Acos) +DECLARE_ACTIVATION_KERNEL(Sin) +DECLARE_ACTIVATION_KERNEL(Asin) +DECLARE_ACTIVATION_KERNEL(Atan) +DECLARE_ACTIVATION_KERNEL(Sinh) +DECLARE_ACTIVATION_KERNEL(Cosh) +DECLARE_ACTIVATION_KERNEL(Asinh) +DECLARE_ACTIVATION_KERNEL(Acosh) +DECLARE_ACTIVATION_KERNEL(Atanh) +DECLARE_ACTIVATION_KERNEL(Relu) + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc new file mode 100644 index 0000000000000..fe43ebb816077 --- /dev/null +++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/activation_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/activation_grad_impl.h" + +namespace phi { + +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradImpl( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradImpl( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::TanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::AcosGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::SinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::AsinGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::AtanGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::SinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::AsinhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::AcoshGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::AtanhGradFunctor); +DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::ReluGradFunctor); + +} // namespace phi + +PD_REGISTER_KERNEL( + cos_grad, CPU, ALL_LAYOUT, phi::CosGradKernel, float, double) {} +PD_REGISTER_KERNEL( + tan_grad, CPU, ALL_LAYOUT, phi::TanGradKernel, float, double) {} +PD_REGISTER_KERNEL( + acos_grad, 
CPU, ALL_LAYOUT, phi::AcosGradKernel, float, double) {} +PD_REGISTER_KERNEL( + sin_grad, CPU, ALL_LAYOUT, phi::SinGradKernel, float, double) {} +PD_REGISTER_KERNEL( + asin_grad, CPU, ALL_LAYOUT, phi::AsinGradKernel, float, double) {} +PD_REGISTER_KERNEL( + atan_grad, CPU, ALL_LAYOUT, phi::AtanGradKernel, float, double) {} +PD_REGISTER_KERNEL( + sinh_grad, CPU, ALL_LAYOUT, phi::SinhGradKernel, float, double) {} +PD_REGISTER_KERNEL( + cosh_grad, CPU, ALL_LAYOUT, phi::CoshGradKernel, float, double) {} +PD_REGISTER_KERNEL( + asinh_grad, CPU, ALL_LAYOUT, phi::AsinhGradKernel, float, double) {} +PD_REGISTER_KERNEL( + acosh_grad, CPU, ALL_LAYOUT, phi::AcoshGradKernel, float, double) {} +PD_REGISTER_KERNEL( + atanh_grad, CPU, ALL_LAYOUT, phi::AtanhGradKernel, float, double) {} +PD_REGISTER_KERNEL( + relu_grad, CPU, ALL_LAYOUT, phi::ReluGradKernel, float, double) {} +PD_REGISTER_KERNEL(relu_double_grad, + CPU, + ALL_LAYOUT, + phi::ReluDoubleGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc new file mode 100644 index 0000000000000..51883f25183af --- /dev/null +++ b/paddle/phi/kernels/cpu/activation_kernel.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
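// A hand expansion of one CPU grad kernel definition from the file above,
// with the template arguments this listing elides filled back in (a sketch
// reconstructed from the macro body, not a quote from the patch):
template <typename T, typename Context>
void CosGradKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const DenseTensor& dout,
                   DenseTensor* dx) {
  funcs::CosGradFunctor<T> functor;
  // DepX kernels pass the forward input and a null forward output; DepOut
  // kernels do the reverse, and ActivationGradImpl fakes whichever is absent.
  ActivationGradImpl<T, Context, funcs::CosGradFunctor<T>>(
      dev_ctx, &x, nullptr, &dout, dx, functor);
}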
*/ + +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/activation_impl.h" + +namespace phi { + +#define DEFINE_CPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + functor_class functor; \ + ActivationImpl(dev_ctx, x, out, functor); \ + } + +DEFINE_CPU_ACTIVATION_KERNEL(Sin, funcs::SinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cos, funcs::CosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Tan, funcs::TanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asin, funcs::AsinFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atan, funcs::AtanFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acos, funcs::AcosFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Sinh, funcs::SinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Cosh, funcs::CoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Asinh, funcs::AsinhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Acosh, funcs::AcoshFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Atanh, funcs::AtanhFunctor) +DEFINE_CPU_ACTIVATION_KERNEL(Relu, funcs::ReluCPUFunctor) + +} // namespace phi +PD_REGISTER_KERNEL(sin, CPU, ALL_LAYOUT, phi::SinKernel, float, double) {} +PD_REGISTER_KERNEL(cos, CPU, ALL_LAYOUT, phi::CosKernel, float, double) {} +PD_REGISTER_KERNEL(tan, CPU, ALL_LAYOUT, phi::TanKernel, float, double) {} +PD_REGISTER_KERNEL(acos, CPU, ALL_LAYOUT, phi::AcosKernel, float, double) {} +PD_REGISTER_KERNEL(asin, CPU, ALL_LAYOUT, phi::AsinKernel, float, double) {} +PD_REGISTER_KERNEL(atan, CPU, ALL_LAYOUT, phi::AtanKernel, float, double) {} +PD_REGISTER_KERNEL(sinh, CPU, ALL_LAYOUT, phi::SinhKernel, float, double) {} +PD_REGISTER_KERNEL(cosh, CPU, ALL_LAYOUT, phi::CoshKernel, float, double) {} +PD_REGISTER_KERNEL(asinh, CPU, ALL_LAYOUT, phi::AsinhKernel, float, double) {} +PD_REGISTER_KERNEL(acosh, CPU, ALL_LAYOUT, phi::AcoshKernel, float, double) {} +PD_REGISTER_KERNEL(atanh, CPU, ALL_LAYOUT, phi::AtanhKernel, float, double) {} +PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h new file mode 100644 index 0000000000000..1a36e4e132f41 --- /dev/null +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -0,0 +1,830 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
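// The functor header that follows leans on two recurring patterns. First,
// dtype::float16 specializations promote to float, call the libm routine,
// and narrow back, since half precision cannot feed the math library
// directly. With the template brackets this listing drops restored, the
// pattern reads:
template <typename T>
struct Sine {
  HOSTDEVICE T operator()(const T& val) const { return sin(val); }
};
template <>
struct Sine<dtype::float16> {
  HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const {
    return dtype::float16(sin(static_cast<float>(val)));  // compute in float
  }
};
// Second, the CUDA functors use MPTypeTrait<T>::Type the same way: a
// higher-precision compute type standing in for a low-precision storage type.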
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif + +#include + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { +namespace funcs { +enum ActBwdOpFwdDeps { + kNoDeps = 0x00, // Do not need any forward input/output + kDepX = 0x01, // Only need forward input X + kDepOut = 0x02, // Only need forward output Out +}; + +template +struct BaseActivationFunctor { + using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } +}; + +template +struct Sine { + HOSTDEVICE T operator()(const T& val) const { return sin(val); } +}; + +template <> +struct Sine { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(sin(static_cast(val))); + } +}; + +template +struct Cosine { + HOSTDEVICE T operator()(const T& val) const { return cos(val); } +}; + +template <> +struct Cosine { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(cos(static_cast(val))); + } +}; + +// sine'(x) = cos(x) +template +struct SinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Cosine()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// sine(x) = sin(x) +template +struct SinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Sine()); + } +}; + +// cosine'(x) = -sin(x) +template +struct CosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = -dout * x.unaryExpr(Sine()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// cosine(x) = cos(x) +template +struct CosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Cosine()); + } +}; + +template +struct Tangent { + HOSTDEVICE T operator()(const T& val) const { return tan(val); } +}; + +template <> +struct Tangent { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(tan(static_cast(val))); + } +}; + +// Tangent'(x) = -Tangent(x) +template +struct TanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout / x.unaryExpr(Cosine()).square(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// Tangent(x) = tan(x) +template +struct TanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Tangent()); + } +}; + +template +struct Sinh { + HOSTDEVICE T operator()(const T& val) const { return sinh(val); } +}; + +template <> +struct Sinh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(sinhf(static_cast(val))); + } +}; + +template +struct Cosh { + HOSTDEVICE T operator()(const T& val) const { return cosh(val); } +}; + +template <> +struct Cosh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return 
dtype::float16(coshf(static_cast(val))); + } +}; + +// sinh(x) = sinh(x) +template +struct SinhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Sinh()); + } +}; + +// cosh(x) = cosh(x) +template +struct CoshFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Cosh()); + } +}; + +// sinh'(x) = cosh(x) +template +struct SinhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Cosh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// cosh'(x) = sinh(x) +template +struct CoshGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Sinh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Acos { + HOSTDEVICE T operator()(const T& val) const { return acos(val); } +}; + +template <> +struct Acos { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(acos(static_cast(val))); + } +}; + +// Acos(x) = acos(x) +template +struct AcosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acos()); + } +}; + +// acos'(x) = -1/sqrt(1-x^2) +template +struct AcosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Asin { + HOSTDEVICE T operator()(const T& val) const { return asin(val); } +}; + +template <> +struct Asin { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(asin(static_cast(val))); + } +}; + +// Asin(x) = asin(x) +template +struct AsinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asin()); + } +}; + +// asin'(x) = 1/sqrt(1-x^2) +template +struct AsinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Atan { + HOSTDEVICE T operator()(const T& val) const { return atan(val); } +}; + +template <> +struct Atan { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(atan(static_cast(val))); + } +}; + +// Atan(x) = atan(x) +template +struct AtanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atan()); + } +}; + +// atan'(x) = 1 / (1 + x^2) +template +struct AtanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Acosh { + HOSTDEVICE T operator()(const T& val) const { return acosh(val); } +}; + +template <> +struct Acosh { + HOSTDEVICE dtype::float16 operator()(const 
dtype::float16& val) const { + return dtype::float16(acosh(static_cast(val))); + } +}; + +// Acosh(x) = acosh(x) +template +struct AcoshFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acosh()); + } +}; + +// acosh'(x) = 1/sqrt(x^2 - 1) +template +struct AcoshGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Asinh { + HOSTDEVICE T operator()(const T& val) const { return asinh(val); } +}; + +template <> +struct Asinh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(asinh(static_cast(val))); + } +}; + +// Asinh(x) = asinh(x) +template +struct AsinhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asinh()); + } +}; + +// asinh'(x) = 1/sqrt(x^2 + 1) +template +struct AsinhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Atanh { + HOSTDEVICE T operator()(const T& val) const { return atanh(val); } +}; + +template <> +struct Atanh { + HOSTDEVICE dtype::float16 operator()(const dtype::float16& val) const { + return dtype::float16(atanh(static_cast(val))); + } +}; + +// Atanh(x) = atanh(x) +template +struct AtanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atanh()); + } +}; + +// atanh'(x) = 1/(1 - x^2) +template +struct AtanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// relu(x) = max(x, 0) +template +struct ReluCPUFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { + return v > static_cast(0) ? 
v : static_cast(0); + }); + } +}; + +template +struct ReluCUDAFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); + } +}; + +template +struct ReluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (out > static_cast(0)).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct ReluGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* ddX, + DenseTensor* ddOut, + DenseTensor* dOut, + DenseTensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); + auto out = EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); + if (ddOut) { + auto ddout = EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); + ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +#if defined(__NVCC__) || defined(__HIPCC__) +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + __device__ __forceinline__ T operator()(const T x) const { + return x > zero ? x : zero; + } +}; + +template +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + __device__ __forceinline__ T operator()(const T dout, const T out) const { + return out > zero ? 
dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // cos(x) = cos(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // sin(x) = sin(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * cos(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // tan(x) = tan(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // asin(x) = asin(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // acos(x) = acos(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 
- x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * sinh(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // dx = dout * cosh(x) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * cosh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAcoshFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Acosh(x) = acosh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(acosh(x)); + } +}; + +template +struct CudaAcoshGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + // dx = dout * 1 / sqrt(x^2 - 1) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / sqrt(x * x - one)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAsinhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Asinh(x) = asinh(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(asinh(x)); + } +}; + +template +struct CudaAsinhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * 1/sqrt(x^2 + 1) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / sqrt(x * x + one)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAtanhFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // Atanh(x) = atanh(x) + __device__ __forceinline__ T operator()(const T 
arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(atanh(x)); + } +}; + +template +struct CudaAtanhGradFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + // dx = dout * 1/(1- x^2) + __device__ __forceinline__ T operator()(const T arg_dout, + const T arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + return static_cast(dout * one / (one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +template +struct CudaAtanFunctor : public BaseActivationFunctor { + using MPType = typename phi::dtype::MPTypeTrait::Type; + + // atan(x) = atan(x) + __device__ __forceinline__ T operator()(const T arg_x) const { + MPType x = static_cast(arg_x); + return static_cast(atan(x)); + } +}; + +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + __device__ __forceinline__ T operator()(const T dout, const T x) const { + return dout / (one + x * x); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } +}; + +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu new file mode 100644 index 0000000000000..c2995c79a7e8c --- /dev/null +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -0,0 +1,221 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
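// ActBwdOpFwdDeps acts as a bitmask throughout the grad paths below:
// kNoDeps = 0x00, kDepX = 0x01, kDepOut = 0x02 (defined in the functor
// header above). A condensed sketch of how the dispatch chooses inputs;
// the helper name CollectGradInputs is illustrative, not from the patch:
template <typename Functor>
void CollectGradInputs(const DenseTensor* x,
                       const DenseTensor* out,
                       std::vector<const DenseTensor*>* ins) {
  int deps = static_cast<int>(Functor::FwdDeps());
  if (deps & static_cast<int>(funcs::ActBwdOpFwdDeps::kDepOut)) {
    ins->push_back(out);  // e.g. relu: dx = dout * (out > 0)
  }
  if (deps & static_cast<int>(funcs::ActBwdOpFwdDeps::kDepX)) {
    ins->push_back(x);    // e.g. cos: dx = -dout * sin(x)
  }
}
// kNoDeps functors (ceil, floor, round) read only dout.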
*/ + +#include "paddle/phi/kernels/activation_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/activation_grad_impl.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +namespace phi { + +template +void ActivationGradGPUImpl(const Context& dev_ctx, + const DenseTensor* x, + const DenseTensor* out, + const DenseTensor* d_out, + DenseTensor* d_x, + const Functor& functor) { + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + PADDLE_ENFORCE_NOT_NULL( + out, errors::NotFound("The input DenseTensor Out can not be nullptr")); + } + PADDLE_ENFORCE_NOT_NULL( + d_out, errors::NotFound("The input DenseTensor dOut can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + d_x, errors::NotFound("The output DenseTensor dX can not be nullptr")); + if (!out) { + out = d_out; // fake out + } + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + PADDLE_ENFORCE_NOT_NULL( + x, errors::NotFound("The input DenseTensor X can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + x = d_x; + } + + dev_ctx.template Alloc(d_x); + + std::vector ins = {d_out}; + std::vector outs = {d_x}; + + if (static_cast(Functor::FwdDeps()) == + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + // Only need forward output Out + ins.push_back(out); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + // Only need forward input X + ins.push_back(x); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } else { + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); + } +} + +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradGPUImpl( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + +#define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(name, functor_class) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + DenseTensor* dx) { \ + functor_class functor; \ + ActivationGradGPUImpl( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepOut(Relu, funcs::CudaReluGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cos, funcs::CudaCosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Tan, funcs::CudaTanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acos, funcs::CudaAcosGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sin, funcs::CudaSinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asin, funcs::CudaAsinGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atan, funcs::CudaAtanGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Sinh, funcs::CudaSinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Cosh, funcs::CudaCoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Asinh, funcs::CudaAsinhGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Acosh, funcs::CudaAcoshGradFunctor); +DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DepX(Atanh, funcs::CudaAtanhGradFunctor); + +} // namespace phi +PD_REGISTER_KERNEL(cos_grad, + GPU, 
+ ALL_LAYOUT, + phi::CosGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(tan_grad, + GPU, + ALL_LAYOUT, + phi::TanGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(acos_grad, + GPU, + ALL_LAYOUT, + phi::AcosGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sin_grad, + GPU, + ALL_LAYOUT, + phi::SinGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asin_grad, + GPU, + ALL_LAYOUT, + phi::AsinGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atan_grad, + GPU, + ALL_LAYOUT, + phi::AtanGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sinh_grad, + GPU, + ALL_LAYOUT, + phi::SinhGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(cosh_grad, + GPU, + ALL_LAYOUT, + phi::CoshGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asinh_grad, + GPU, + ALL_LAYOUT, + phi::AsinhGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(acosh_grad, + GPU, + ALL_LAYOUT, + phi::AcoshGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atanh_grad, + GPU, + ALL_LAYOUT, + phi::AtanhGradKernel, + float, + double, + phi::dtype::float16) {} +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(relu_grad, + GPU, + ALL_LAYOUT, + phi::ReluGradKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(relu_double_grad, + GPU, + ALL_LAYOUT, + phi::ReluDoubleGradKernel, + float, + double, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(relu_grad, + GPU, + ALL_LAYOUT, + phi::ReluGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(relu_double_grad, + GPU, + ALL_LAYOUT, + phi::ReluDoubleGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu new file mode 100644 index 0000000000000..26752b89e7c34 --- /dev/null +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -0,0 +1,143 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
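// The forward GPU file that follows reduces every activation to a single
// elementwise launch. With the elided template arguments restored, the core
// of ActivationGPUImpl is just (a sketch of the pattern, not a quote):
dev_ctx.template Alloc<T>(out);
std::vector<const DenseTensor*> ins = {&x};
std::vector<DenseTensor*> outs = {out};
funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
// so adding a new activation needs only a functor plus a registration,
// with no per-op launch code.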
*/ + +#include "paddle/phi/kernels/activation_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/impl/activation_grad_impl.h" + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" + +namespace phi { + +template +void ActivationGPUImpl(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out, + const Functor& functor) { + PADDLE_ENFORCE_NOT_NULL(out, + errors::NotFound("Output Out should not be nullptr")); + dev_ctx.template Alloc(out); + std::vector ins = {&x}; + std::vector outs = {out}; + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} + +#define DEFINE_GPU_ACTIVATION_KERNEL(name, functor_class) \ + template \ + void name##Kernel( \ + const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) { \ + functor_class functor; \ + ActivationGPUImpl(dev_ctx, x, out, functor); \ + } + +DEFINE_GPU_ACTIVATION_KERNEL(Cos, funcs::CudaCosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Tan, funcs::CudaTanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acos, funcs::CudaAcosFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sin, funcs::CudaSinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asin, funcs::CudaAsinFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atan, funcs::CudaAtanFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Sinh, funcs::CudaSinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Cosh, funcs::CudaCoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Asinh, funcs::CudaAsinhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Acosh, funcs::CudaAcoshFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Atanh, funcs::CudaAtanhFunctor) +DEFINE_GPU_ACTIVATION_KERNEL(Relu, funcs::CudaReluFunctor) + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(relu, + GPU, + ALL_LAYOUT, + phi::ReluKernel, + float, + double, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(relu, + GPU, + ALL_LAYOUT, + phi::ReluKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif +PD_REGISTER_KERNEL( + sin, GPU, ALL_LAYOUT, phi::SinKernel, float, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL( + cos, GPU, ALL_LAYOUT, phi::CosKernel, float, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL( + tan, GPU, ALL_LAYOUT, phi::TanKernel, float, double, phi::dtype::float16) {} +PD_REGISTER_KERNEL(acos, + GPU, + ALL_LAYOUT, + phi::AcosKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asin, + GPU, + ALL_LAYOUT, + phi::AsinKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atan, + GPU, + ALL_LAYOUT, + phi::AtanKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(sinh, + GPU, + ALL_LAYOUT, + phi::SinhKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(cosh, + GPU, + ALL_LAYOUT, + phi::CoshKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(asinh, + GPU, + ALL_LAYOUT, + phi::AsinhKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(acosh, + GPU, + ALL_LAYOUT, + phi::AcoshKernel, + float, + double, + phi::dtype::float16) {} +PD_REGISTER_KERNEL(atanh, + GPU, + ALL_LAYOUT, + phi::AtanhKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h new file mode 100644 index 0000000000000..80e23d2b8e24b --- /dev/null +++ b/paddle/phi/kernels/impl/activation_grad_impl.h @@ -0,0 +1,133 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace phi { + +template +void ActivationGradImpl(const Context& dev_ctx, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* dOut, + DenseTensor* dX, + const Functor& functor) { + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + PADDLE_ENFORCE_NOT_NULL( + Out, errors::NotFound("The input DenseTensor Out can not be nullptr")); + } + PADDLE_ENFORCE_NOT_NULL( + dOut, errors::NotFound("The input DenseTensor dOut can not be nullptr")); + PADDLE_ENFORCE_NOT_NULL( + dX, errors::NotFound("The output DenseTensor dX can not be nullptr")); + if (!Out) { + Out = dOut; // fake out + } + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + PADDLE_ENFORCE_NOT_NULL( + X, errors::NotFound("The input DenseTensor X can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + X = dX; + } + + dev_ctx.template Alloc(dX); + auto dout = phi::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "Out@GRAD", "ActivationGrad")); + auto out = phi::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "ActivationGrad")); + auto dx = phi::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Input", "X@GRAD", "ActivationGrad")); + auto x = phi::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "ActivationGrad")); + auto* place = dev_ctx.eigen_device(); + // use 32bit index to speed up computation + bool use_32bit_index = out.size() < Eigen::NumTraits::highest(); + bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace()); + if (use_32bit_index && is_gpu_place) { + functor(*place, + To32BitIndex(x), + To32BitIndex(out), + To32BitIndex(dout), + To32BitIndex(dx)); + } else { + functor(*place, x, out, dout, dx); + } +} + +template +void ActivationDoubleGradImpl(const Context& dev_ctx, + const DenseTensor* X, + const DenseTensor* Out, + const DenseTensor* ddX, + DenseTensor* dX, + DenseTensor* dOut, + DenseTensor* ddOut, + const Functor& functor) { + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepX)) { + PADDLE_ENFORCE_NOT_NULL( + X, errors::NotFound("The input DenseTensor X can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + X = ddX; + } + if (static_cast(Functor::FwdDeps()) & + static_cast(funcs::ActBwdOpFwdDeps::kDepOut)) { + PADDLE_ENFORCE_NOT_NULL( + Out, errors::NotFound("The input DenseTensor Out can not be nullptr")); + } else { + VLOG(10) << "Inplace activation of Op Functor: " << typeid(Functor).name(); + Out = ddX; + } + + if (ddOut) { + dev_ctx.template Alloc(ddOut); + } + if (dOut) { + dev_ctx.template Alloc(dOut); + } + if (dX) { + dX->Resize(Out->dims()); + dev_ctx.template 
Alloc(dX); + } + + functor(dev_ctx, X, Out, ddX, ddOut, dOut, dX); +} + +template +void ReluDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& ddx, + DenseTensor* ddout) { + funcs::ReluGradGradFunctor relu_double_grad_functor; + ActivationDoubleGradImpl>( + dev_ctx, + nullptr, + &out, + &ddx, + nullptr, + nullptr, + ddout, + relu_double_grad_functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/activation_impl.h b/paddle/phi/kernels/impl/activation_impl.h new file mode 100644 index 0000000000000..ca3debd394a1e --- /dev/null +++ b/paddle/phi/kernels/impl/activation_impl.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace phi { + +#define ToString(x) #x + +template +void ActivationImpl(const Context& dev_ctx, + const DenseTensor& X, + DenseTensor* Out, + const Functor& functor) { + PADDLE_ENFORCE_NOT_NULL(Out, + errors::NotFound("Output Out should not be nullptr")); + dev_ctx.template Alloc(Out); + auto x = phi::EigenVector::Flatten( + GET_DATA_SAFELY(&X, "Input", "X", "Activation")); + auto out = phi::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Output", "Out", "Activation")); + auto* place = dev_ctx.eigen_device(); + // use 32bit index to speed up computation + bool use_32bit_index = out.size() < Eigen::NumTraits::highest(); + bool is_gpu_place = paddle::platform::is_gpu_place(dev_ctx.GetPlace()); + if (use_32bit_index && is_gpu_place) { + functor(*place, To32BitIndex(x), To32BitIndex(out)); + } else { + functor(*place, x, out); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc new file mode 100644 index 0000000000000..396830ca20765 --- /dev/null +++ b/paddle/phi/ops/compat/activation_sig.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
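ActivationGradImpl above keys its null checks and tensor substitutions off Functor::FwdDeps(): each grad functor declares whether it reads the forward input X, the forward output Out, or neither, and the impl passes a harmless alias (dOut or dX) for whatever will never be touched. A sketch of the shape such a functor takes; the five-argument Eigen-style operator() and the functor name are assumptions for illustration, while funcs::ActBwdOpFwdDeps is the enum used above:

    // Hypothetical relu-style grad functor: dx = dout * (out > 0).
    template <typename T>
    struct DemoGradFunctor {
      template <typename Device, typename X, typename Out, typename dOut,
                typename dX>
      void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
        // x may be a fake alias here; only out and dout are actually read.
        dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
      }
      static constexpr funcs::ActBwdOpFwdDeps FwdDeps() {
        // Tells ActivationGradImpl to enforce Out != nullptr and to fake X.
        return funcs::ActBwdOpFwdDeps::kDepOut;
      }
    };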
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +#define DefineActGradDepXOpArgMap(func_name, op_name) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature( \ + op_name "_grad", {"X", GradVarName("Out")}, {}, {GradVarName("X")}); \ + } + +#define DefineActGradDepOutOpArgMap(func_name, op_name) \ + KernelSignature func_name##GradOpArgumentMapping( \ + const ArgumentMappingContext& ctx) { \ + return KernelSignature( \ + op_name "_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); \ + } + +KernelSignature ReluDoubleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("relu_double_grad", {"Out", "DDX"}, {}, {"DDOut"}); +} + +DefineActGradDepXOpArgMap(Cos, "cos"); +DefineActGradDepXOpArgMap(Tan, "tan"); +DefineActGradDepXOpArgMap(Acos, "acos"); +DefineActGradDepXOpArgMap(Sin, "sin"); +DefineActGradDepXOpArgMap(Asin, "asin"); +DefineActGradDepXOpArgMap(Atan, "atan"); +DefineActGradDepXOpArgMap(Sinh, "sinh"); +DefineActGradDepXOpArgMap(Cosh, "cosh"); +DefineActGradDepXOpArgMap(Asinh, "asinh"); +DefineActGradDepXOpArgMap(Acosh, "acosh"); +DefineActGradDepXOpArgMap(Atanh, "atanh"); +DefineActGradDepOutOpArgMap(Relu, "relu"); +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); + +PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acos_grad, phi::AcosGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sin_grad, phi::SinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asin_grad, phi::AsinGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atan_grad, phi::AtanGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(sinh_grad, phi::SinhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cosh_grad, phi::CoshGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(asinh_grad, phi::AsinhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(acosh_grad, phi::AcoshGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(atanh_grad, phi::AtanhGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu_grad, phi::ReluGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(relu_grad_grad, + phi::ReluDoubleGradOpArgumentMapping); From 7024ade70597962aad8e7f7cf77b174fa821ee13 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 8 Mar 2022 15:54:32 +0800 Subject: [PATCH 27/50] [Phi] Move matrix inverse into phi (#40237) * move matrix inverse into phi * change license year --- paddle/fluid/operators/determinant_op.h | 6 +- paddle/fluid/operators/inverse_op.h | 4 +- paddle/fluid/operators/math/CMakeLists.txt | 1 - paddle/fluid/operators/math/matrix_inverse.cc | 38 ----- .../fluid/operators/math/matrix_inverse.cu.cc | 124 --------------- paddle/fluid/operators/matrix_power_op.h | 6 +- paddle/phi/kernels/funcs/CMakeLists.txt | 1 + paddle/phi/kernels/funcs/matrix_inverse.cc | 37 +++++ paddle/phi/kernels/funcs/matrix_inverse.cu.cc | 141 ++++++++++++++++++ .../kernels/funcs}/matrix_inverse.h | 41 ++--- 10 files changed, 208 insertions(+), 191 deletions(-) delete mode 100644 paddle/fluid/operators/math/matrix_inverse.cc delete mode 100644 paddle/fluid/operators/math/matrix_inverse.cu.cc create mode 100644 paddle/phi/kernels/funcs/matrix_inverse.cc create mode 100644 paddle/phi/kernels/funcs/matrix_inverse.cu.cc rename paddle/{fluid/operators/math => phi/kernels/funcs}/matrix_inverse.h (61%) diff --git a/paddle/fluid/operators/determinant_op.h 
b/paddle/fluid/operators/determinant_op.h index 375ef4344f474..463a707ccf15b 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,11 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -226,7 +226,7 @@ class DeterminantGradKernel : public framework::OpKernel { inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); @@ -381,7 +381,7 @@ class SlogDeterminantGradKernel : public framework::OpKernel { inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50ae0..31c22915ec5d0 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index bce927c32ddf7..d5a86d62b417c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -46,7 +46,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) math_library(segment_pooling) math_library(matrix_solve) diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 index 1b36e615c68df..0000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
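The call sites in determinant_op.h and inverse_op.h change only in namespace; the functor keeps its interface of a default-constructed object invoked with (device context, input, output). The pattern, with the template arguments that the flattened diff drops written out as an assumption:

    // Hypothetical caller inside a fluid kernel after the move; inverse_A
    // must already be resized and allocated, as in DeterminantGradKernel.
    phi::funcs::MatrixInverseFunctor<DeviceContext, T> mat_inv;
    mat_inv(dev_ctx, *input, &inverse_A);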
*/ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417a9..0000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. - memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h index d2c67d80b4f5a..8eb9c58513df6 100644 --- a/paddle/fluid/operators/matrix_power_op.h +++ b/paddle/fluid/operators/matrix_power_op.h @@ -18,9 +18,9 @@ limitations under the License. 
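The n-versus-32 branch in the functor deleted above (and carried over unchanged into the phi version below) maps onto two cuBLAS entry points: the fused batched-inverse routine, which cuBLAS documents only for n < 32, and the batched getrf/getri pair otherwise. The same decision against raw cuBLAS, as a float-only sketch with status checks elided:

    #include <cublas_v2.h>

    // d_A / d_Ainv are device arrays of device pointers, one per batch entry;
    // d_pivots holds n ints per matrix, d_info one int per matrix.
    void BatchedInverse(cublasHandle_t handle, int n, float** d_A,
                        float** d_Ainv, int* d_pivots, int* d_info, int batch) {
      if (n < 32) {
        // Fused LU + inversion; valid only for small n.
        cublasSmatinvBatched(handle, n, d_A, n, d_Ainv, n, d_info, batch);
      } else {
        // P * A = L * U in place, then form the inverse out of place.
        cublasSgetrfBatched(handle, n, d_A, n, d_pivots, d_info, batch);
        cublasSgetriBatched(handle, n, d_A, n, d_pivots, d_Ainv, n, d_info,
                            batch);
      }
    }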
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -67,7 +67,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); } else { // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *X, &new_x); new_n = -n; } @@ -200,7 +200,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); } else { // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *X, &new_x); new_n = -n; } diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index 02cba6009c400..f0fbb7bf0849b 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -9,3 +9,4 @@ math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) math_library(concat_and_split_functor DEPS dense_tensor) math_library(matrix_reduce DEPS dense_tensor) +math_library(matrix_inverse DEPS dense_tensor eigen3 blas) diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc new file mode 100644 index 0000000000000..c95e97f8ea81a --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_inverse.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/funcs/matrix_inverse.h" + +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { +namespace funcs { + +template +void MatrixInverseFunctor::operator()(const Context& dev_ctx, + const DenseTensor& a, + DenseTensor* a_inv) { + ComputeInverseEigen(dev_ctx, a, a_inv); +} + +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +// TODO(chenweihang): remove these instantiations later +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc new file mode 100644 index 0000000000000..686b8405bf750 --- /dev/null +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
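On CPU, and as the HIP fallback in the file below, the work is a plain Eigen partial-pivot LU per batch entry: ComputeInverseEigen (shown further down in the header) maps each n-by-n slice and inverts it. The per-matrix core in freestanding float-only form, mirroring that function:

    #include <cassert>

    #include "Eigen/Core"
    #include "Eigen/LU"

    // Wrap raw buffers in Eigen maps, LU-factorize, reject singular inputs,
    // then write the inverse without an extra temporary.
    void InverseOneMatrix(const float* a, float* a_inv, int n) {
      using Matrix = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>;
      Eigen::Map<const Matrix> mat(a, n, n);
      Eigen::Map<Matrix> mat_inv(a_inv, n, n);

      Eigen::PartialPivLU<Matrix> lu;
      lu.compute(mat);
      // The smallest LU pivot doubles as a cheap singularity check.
      const float min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
      assert(min_abs_pivot > 0.0f && "input is not invertible");
      mat_inv.noalias() = lu.inverse();
    }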
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/funcs/matrix_inverse.h"
+
+#include "paddle/phi/kernels/funcs/blas/blas.h"
+
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/memory/memcpy.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename Context, typename T>
+void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
+                                                  const DenseTensor& a,
+                                                  DenseTensor* a_inv) {
+#ifndef PADDLE_WITH_HIP
+  const auto& mat_dims = a.dims();
+  const int rank = mat_dims.size();
+  int n = mat_dims[rank - 1];
+  int batch_size = rank > 2 ? a.numel() / (n * n) : 1;
+
+  paddle::memory::allocation::AllocationPtr tmp_gpu_mat_data;
+  const T* gpu_mat = a.data<T>();
+  if (n >= 32) {
+    // Copy all elements of input matrix A to a temporary memory space to
+    // avoid being overridden by getrf.
+    tmp_gpu_mat_data = paddle::memory::Alloc(dev_ctx, a.numel() * sizeof(T));
+    paddle::memory::Copy(dev_ctx.GetPlace(),
+                         tmp_gpu_mat_data->ptr(),
+                         dev_ctx.GetPlace(),
+                         a.data<T>(),
+                         a.numel() * sizeof(T),
+                         dev_ctx.stream());
+    gpu_mat = reinterpret_cast<const T*>(tmp_gpu_mat_data->ptr());
+  }
+
+  std::vector<const T*> cpu_ptrs(batch_size * 2);
+  for (int i = 0; i < batch_size; ++i) {
+    cpu_ptrs[i] = gpu_mat + i * n * n;
+    cpu_ptrs[i + batch_size] = a_inv->data<T>() + i * n * n;
+  }
+
+  // Copy the addresses of A and A_inv from host to device.
+  paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data =
+      paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*));
+  paddle::memory::Copy(dev_ctx.GetPlace(),
+                       tmp_gpu_ptrs_data->ptr(),
+                       phi::CPUPlace(),
+                       static_cast<void*>(cpu_ptrs.data()),
+                       cpu_ptrs.size() * sizeof(T*),
+                       dev_ctx.stream());
+  T** gpu_inv_ptrs =
+      reinterpret_cast<T**>(tmp_gpu_ptrs_data->ptr()) + batch_size;
+
+  // Allocate device memory for info and pivots.
+  int num_ints = n < 32 ? batch_size : batch_size * (n + 1);
+  paddle::memory::allocation::AllocationPtr tmp_gpu_info_data =
+      paddle::memory::Alloc(dev_ctx, num_ints * sizeof(int));
+  int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
+
+  auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
+
+  std::vector<int> info;  // only for singular checking
+  info.resize(batch_size);
+  // This function in cuBLAS is intended to be used for matrices of small
+  // sizes where the launch overhead is a significant factor.
+  // TODO(Xreki): call function in cusolver for large matrices.
+  if (n < 32) {
+    // cublasmatinvBatched is a shortcut for cublasgetrfBatched
+    // plus cublasgetriBatched.
+    // However it only works if N is less than 32. If not, we need to
+    // go through cublasgetrfBatched and cublasgetriBatched.
+    blas.BatchedMatInv(n,
+                       reinterpret_cast<const T**>(tmp_gpu_ptrs_data->ptr()),
+                       gpu_inv_ptrs,
+                       gpu_info_ptr,
+                       batch_size);
+  } else {
+    // This function performs the LU factorization of each matrix A by the
+    // equation P * A = L * U. L and U are written back to original matrix A,
+    // and diagonal elements of L are discarded.
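+    // (Buffer layout note: tmp_gpu_info_data holds [batch_size infos]
+    // followed by [n * batch_size pivots]. The matinv path above needs only
+    // the infos, which is why num_ints shrinks to batch_size when n < 32,
+    // while this path carves its pivot array out of the same allocation.)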
+ int* gpu_pivot_ptr = + reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; + blas.BatchedGETRF(n, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_pivot_ptr, + gpu_info_ptr, + batch_size); + + blas.BatchedGETRI(n, + reinterpret_cast(tmp_gpu_ptrs_data->ptr()), + gpu_pivot_ptr, + gpu_inv_ptrs, + gpu_info_ptr, + batch_size); + } + paddle::memory::Copy(phi::CPUPlace(), + info.data(), + dev_ctx.GetPlace(), + gpu_info_ptr, + sizeof(int) * batch_size, + dev_ctx.stream()); + for (int i = 0; i < batch_size; ++i) { + PADDLE_ENFORCE_EQ(info[i], + 0, + phi::errors::PreconditionNotMet( + "For batch [%d]: U(%d, %d) is zero, singular U. " + "Please check the matrix value and change it to a " + "non-singular matrix", + i, + info[i], + info[i])); + } +#else + ComputeInverseEigen(dev_ctx, a, a_inv); +#endif +} + +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +// TODO(chenweihang): remove these instantiations later +template class MatrixInverseFunctor; +template class MatrixInverseFunctor; + +} // namespace funcs +} // namespace phi diff --git a/paddle/fluid/operators/math/matrix_inverse.h b/paddle/phi/kernels/funcs/matrix_inverse.h similarity index 61% rename from paddle/fluid/operators/math/matrix_inverse.h rename to paddle/phi/kernels/funcs/matrix_inverse.h index fb58b48366652..c5b04a8106561 100644 --- a/paddle/fluid/operators/math/matrix_inverse.h +++ b/paddle/phi/kernels/funcs/matrix_inverse.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,17 +17,18 @@ limitations under the License. */ #include #include "Eigen/Core" #include "Eigen/LU" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -namespace paddle { -namespace operators { -namespace math { +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" -template -void compute_inverse_eigen(const DeviceContext& context, - const framework::Tensor& a, - framework::Tensor* a_inv) { +namespace phi { +namespace funcs { + +template +void ComputeInverseEigen(const Context& dev_ctx, + const DenseTensor& a, + DenseTensor* a_inv) { using Matrix = Eigen::Matrix; using EigenMatrixMap = Eigen::Map; @@ -38,7 +39,7 @@ void compute_inverse_eigen(const DeviceContext& context, int batch_size = rank > 2 ? 
a.numel() / (n * n) : 1; const T* a_ptr = a.data(); - T* a_inv_ptr = a_inv->mutable_data(context.GetPlace()); + T* a_inv_ptr = a_inv->mutable_data(dev_ctx.GetPlace()); for (int i = 0; i < batch_size; ++i) { ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); @@ -47,20 +48,20 @@ void compute_inverse_eigen(const DeviceContext& context, lu.compute(mat); const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); - PADDLE_ENFORCE_GT( - min_abs_pivot, static_cast(0), - platform::errors::InvalidArgument("Input is not invertible.")); + PADDLE_ENFORCE_GT(min_abs_pivot, + static_cast(0), + errors::InvalidArgument("Input is not invertible.")); mat_inv.noalias() = lu.inverse(); } } -template +template class MatrixInverseFunctor { public: - void operator()(const DeviceContext& context, const framework::Tensor& a, - framework::Tensor* a_inv); + void operator()(const Context& dev_ctx, + const DenseTensor& a, + DenseTensor* a_inv); }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi From 73583f862b7ac88328b201e5ac8d22bc4c122078 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Tue, 8 Mar 2022 16:04:05 +0800 Subject: [PATCH 28/50] add the implementation of process group for hccl (#40228) * add pg_hccl --- .../distributed/collective/CMakeLists.txt | 3 + .../fluid/distributed/collective/HCCLTools.h | 174 +++++++++ .../collective/ProcessGroupHCCL.cc | 356 ++++++++++++++++++ .../distributed/collective/ProcessGroupHCCL.h | 152 ++++++++ .../fluid/platform/device/npu/hccl_helper.h | 17 + paddle/fluid/pybind/CMakeLists.txt | 3 + paddle/fluid/pybind/distributed_py.cc | 12 + .../tests/unittests/npu/process_group_hccl.py | 249 ++++++++++++ .../npu/test_collective_process_group_hccl.py | 29 ++ 9 files changed, 995 insertions(+) create mode 100644 paddle/fluid/distributed/collective/HCCLTools.h create mode 100644 paddle/fluid/distributed/collective/ProcessGroupHCCL.cc create mode 100644 paddle/fluid/distributed/collective/ProcessGroupHCCL.h create mode 100644 python/paddle/fluid/tests/unittests/npu/process_group_hccl.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 96bc4a710f8c1..f88c993d85e2f 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -7,3 +7,6 @@ cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() +if(WITH_ASCEND_CL) + cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h new file mode 100644 index 0000000000000..09789bd4d3786 --- /dev/null +++ b/paddle/fluid/distributed/collective/HCCLTools.h @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+
+#include "boost/variant.hpp"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+class NPUEventManager {
+ public:
+  NPUEventManager() = default;
+
+  ~NPUEventManager() {
+    if (is_created_) {
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUEventDestroy(event_);
+    }
+  }
+
+  NPUEventManager(const NPUEventManager&) = delete;
+  NPUEventManager& operator=(const NPUEventManager&) = delete;
+
+  NPUEventManager(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+
+  NPUEventManager& operator=(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+    return *this;
+  }
+
+  bool IsCreated() const { return is_created_; }
+  int8_t DeviceId() const { return device_index_; }
+  aclrtEvent GetRawNPUEvent() const { return event_; }
+
+  void Record(const paddle::platform::NPUDeviceContext& ctx) {
+    auto device_index = ctx.GetPlace().device;
+    if (!is_created_) {
+      CreateEvent(device_index);
+    }
+    PADDLE_ENFORCE_EQ(device_index, device_index_,
+                      platform::errors::PreconditionNotMet(
+                          "NPUDeviceContext's device %d does not match "
+                          "Event's device %d",
+                          device_index, device_index_));
+
+    platform::NPUDeviceGuard guard(device_index_);
+    platform::NPUEventRecord(event_, ctx.stream());
+  }
+
+  bool Query() const {
+    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+    platform::NPUEventQuery(event_, &status);
+    if (status == ACL_EVENT_STATUS_COMPLETE) {
+      return true;
+    }
+    return false;
+  }
+
+  void Block(const paddle::platform::NPUDeviceContext& ctx) const {
+    if (is_created_) {
+      auto device_index = ctx.GetPlace().device;
+      PADDLE_ENFORCE_EQ(device_index, device_index_,
+                        platform::errors::PreconditionNotMet(
+                            "NPUDeviceContext's device %d does not match "
+                            "Event's device %d",
+                            device_index, device_index_));
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUStreamWaitEvent(ctx.stream(), event_);
+    }
+  }
+
+ private:
+  bool is_created_{false};
+  aclrtEvent event_{};
+  int8_t device_index_{0};
+
+ private:
+  void CreateEvent(int device_index) {
+    device_index_ = device_index;
+    platform::NPUDeviceGuard guard(device_index);
+    platform::NPUEventCreate(&event_);
+    is_created_ = true;
+  }
+};
+
+class HCCLCommManager {
+ public:
+  explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
+
+  HCCLCommManager() : HCCLCommManager(nullptr) {}
+
+  ~HCCLCommManager() noexcept {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (hccl_comm_) {
+      platform::dynload::HcclCommDestroy(hccl_comm_);
+    }
+  }
+
+  static std::shared_ptr<HCCLCommManager> Create(int num_ranks, int rank,
+                                                 HcclRootInfo*
comm_id, + HcclComm hccl_comm) { + auto hccl_manager = std::make_shared(); + auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank, + &hccl_comm); + using __NPU_STATUS_TYPE__ = decltype(ret); + constexpr auto __success_type__ = + platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess; + if (UNLIKELY(ret != __success_type__)) { + VLOG(0) << "Error: create hccl_id error."; + exit(-1); + } + + hccl_manager->hccl_id_ = comm_id; + hccl_manager->rank_ = rank; + hccl_manager->hccl_comm_ = hccl_comm; + return hccl_manager; + } + + HcclRootInfo* GetHcclId() const { + std::unique_lock lock(mutex_); + return hccl_id_; + } + + HcclComm GetHcclComm() const { + std::unique_lock lock(mutex_); + return hccl_comm_; + } + + HCCLCommManager(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(const HCCLCommManager&) = delete; + HCCLCommManager& operator=(HCCLCommManager&& other) = delete; + + HCCLCommManager(HCCLCommManager&& other) { + std::unique_lock lock(other.mutex_); + std::swap(hccl_comm_, other.hccl_comm_); + } + + protected: + HcclComm hccl_comm_; + HcclRootInfo* hccl_id_; + int rank_; + mutable std::mutex mutex_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc new file mode 100644 index 0000000000000..84f5ca48d25c8 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -0,0 +1,356 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/npu/hccl_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" + +DECLARE_bool(hccl_blocking_wait); +// DECLARE_bool(use_stream_safe_npu_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static HcclReduceOp ToHCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, HCCL_REDUCE_MIN}, + {ReduceOp::MAX, HCCL_REDUCE_MAX}, + {ReduceOp::SUM, HCCL_REDUCE_SUM}, + {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ( + it != red_type.end(), true, + platform::errors::InvalidArgument("Invalid hccl reduction. 
" + "Must be Min | Max | Prod | Sum")); + return it->second; +} + +std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { + const uint8_t* bytes = reinterpret_cast(&hcclID); + std::ostringstream oss; + for (size_t i = 0; i < sizeof(hcclID); ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +// bool CheckTensorsInNPUPlace(const std::vector& tensors) { +// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { +// return t.place() == platform::DeviceType::NPU; +// }); +// } + +void SyncDefaultStream( + const std::vector& places, + std::vector& hcclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + hcclEvents[i].Record(*dev_ctx[i]); + hcclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupHCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, + CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + hcclComms_.resize(places.size()); +} + +ProcessGroupHCCL::HCCLTask::~HCCLTask() {} + +void ProcessGroupHCCL::HCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + platform::NPUStreamWaitEvent(default_ctx->stream(), + control_events_[i].GetRawNPUEvent()); + } +} + +bool ProcessGroupHCCL::HCCLTask::IsCompleted() { + for (size_t i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sandyhouse): Add timeout for wait, now timeout unused +bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + if (FLAGS_hccl_blocking_wait) { + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + } + return true; +} + +// Same as Wait +void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); } + +ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store, + int rank, int size) + : ProcessGroup(rank, size), store_(store) {} + +void ProcessGroupHCCL::BroadcastUniqueHCCLID( + std::vector& hccl_ids) { // NOLINT + if (rank_ == 0) { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto hccl_id = std::vector( + reinterpret_cast(&hccl_ids[i]), + reinterpret_cast(&hccl_ids[i]) + sizeof(HcclRootInfo)); + 
store_->set(key, hccl_id); + } + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&hccl_ids[i], ret.data(), ret.size()); + } + } +} + +// create HCCLManager cache for places_key +void ProcessGroupHCCL::CreateHCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the HCCL Communicator since " + "the NPU place are not known")); + + std::vector> hccl_comms; + hccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector hccl_ids; + hccl_ids.resize(1); + auto& hccl_id = hccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id)); + } + BroadcastUniqueHCCLID(hccl_ids); + + VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key + << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + std::unique_ptr comms(new HcclComm[places.size()]); + for (size_t i = 0; i < places.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id, + comms.get() + i); + dev_ctx[i].reset(new NPUDeviceContext(places[i])); + } + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupHCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < inputs.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(inputs[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +template +std::shared_ptr ProcessGroupHCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + 
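+  // As in Collective(): synchronize each device's default stream with its
+  // cached comm stream through the recorded events (SyncDefaultStream above)
+  // before queuing the point-to-point work on the comm streams.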
SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < tensors.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(tensors[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); + } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupHCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // NPUPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupHCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclBroadcast( + input_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h new file mode 100644 index 0000000000000..f2376b4eed760 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
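Both templated paths above funnel through the same lazily built cache: GetKeyFromPlaces turns the tensor placements into a string key, and the first collective on a given device set pays for HCCL communicator, context, and event construction, which every later call on that set reuses. The caching idea in isolation, with hypothetical names standing in for the comm/context/event trio:

    #include <map>
    #include <mutex>
    #include <string>
    #include <vector>

    // CommBundle stands in for the per-device HCCL comm + context + event.
    struct CommBundle { /* HcclComm, NPUDeviceContext, event ... */ };

    class CommCache {
     public:
      // places_key comes from something like GetKeyFromPlaces(places).
      std::vector<CommBundle>& GetOrCreate(const std::string& places_key,
                                           int num_devices) {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = cache_.find(places_key);
        if (it == cache_.end()) {
          // First collective on this device set: the expensive init runs once.
          it = cache_.emplace(places_key,
                              std::vector<CommBundle>(num_devices)).first;
        }
        return it->second;
      }

     private:
      std::mutex mutex_;
      std::map<std::string, std::vector<CommBundle>> cache_;
    };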
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/device/npu/npu_stream.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +constexpr const char* HCCL_BACKEND_NAME = "HCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using NPUStream = platform::stream::NPUStream; +using NPUDeviceContext = paddle::platform::NPUDeviceContext; + +class ProcessGroupHCCL : public ProcessGroup { + public: + class HCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~HCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> hcclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(HCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr Send(std::vector& tensors, + int dst_rank) override; + + std::shared_ptr Recv(std::vector& tensors, + int src_rank) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + std::shared_ptr store_; + std::shared_ptr hccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_hcclcomm_; + + std::unordered_map> + places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + std::set used_place_ids_; + + private: + void BcastHCCLId(std::vector& hccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueHCCLID(std::vector& hccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + + void CreateHCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/platform/device/npu/hccl_helper.h b/paddle/fluid/platform/device/npu/hccl_helper.h index efbc56bee720b..134ec04030d75 100644 --- a/paddle/fluid/platform/device/npu/hccl_helper.h +++ 
b/paddle/fluid/platform/device/npu/hccl_helper.h
@@ -53,6 +53,23 @@ inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) {
   }
 }
 
+inline HcclDataType ToHCCLDataType(experimental::DataType type) {
+  if (type == experimental::DataType::FLOAT32) {
+    return HCCL_DATA_TYPE_FP32;
+  } else if (type == experimental::DataType::FLOAT16) {
+    return HCCL_DATA_TYPE_FP16;
+  } else if (type == experimental::DataType::INT64) {
+    return HCCL_DATA_TYPE_INT64;
+  } else if (type == experimental::DataType::INT32) {
+    return HCCL_DATA_TYPE_INT32;
+  } else if (type == experimental::DataType::INT8) {
+    return HCCL_DATA_TYPE_INT8;
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "This datatype in hccl is not supported."));
+  }
+}
+
 // NOTE(minqiyang): according to the ncclGroupEnd documentations:
 // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
 // ncclGroupEnd will wait for all communicators to be initialized, which will
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 7ff501ef43df7..f40cd51a7b286 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -88,6 +88,9 @@ if(NOT ON_INFER)
   if (WITH_GLOO)
     set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_gloo)
   endif()
+  if(WITH_ASCEND_CL)
+    set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_hccl)
+  endif()
   set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc)
 endif()
diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc
index 9870eab8da902..0b1796703817c 100644
--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -35,6 +35,10 @@ limitations under the License. */
 #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #endif
 
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+#endif
+
 #if defined(PADDLE_WITH_GLOO)
 #include "paddle/fluid/distributed/collective/ProcessGroupGloo.h"
 #include "paddle/fluid/distributed/store/tcp_store.h"
@@ -201,6 +205,14 @@ void BindDistributed(py::module *m) {
           py::call_guard<py::gil_scoped_release>());
 #endif
 
+#if defined(PADDLE_WITH_ASCEND_CL)
+  py::class_<distributed::ProcessGroupHCCL,
+             std::shared_ptr<distributed::ProcessGroupHCCL>>(
+      *m, "ProcessGroupHCCL", ProcessGroup)
+      .def(py::init<const std::shared_ptr<distributed::Store> &, int, int>(),
+           py::call_guard<py::gil_scoped_release>());
+#endif
+
   py::class_<distributed::ProcessGroup::Task,
              std::shared_ptr<distributed::ProcessGroup::Task>>(*m, "task")
       .def("is_completed", &distributed::ProcessGroup::Task::IsCompleted)
diff --git a/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py
new file mode 100644
index 0000000000000..37a24885be1bf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/process_group_hccl.py
@@ -0,0 +1,249 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
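+# The test below mirrors the NCCL process-group test: every rank builds a
+# TCPStore-backed ProcessGroupHCCL, then checks allreduce and broadcast
+# results against references computed locally on each rank.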
+ +from __future__ import print_function + +import unittest +import random +import numpy as np +import os +import shutil + +import paddle +from paddle.fluid import core +from datetime import timedelta +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv + + +def init_process_group(strategy=None): + nranks = ParallelEnv().nranks + rank = ParallelEnv().local_rank + is_master = True if rank == 0 else False + store = paddle.fluid.core.TCPStore("127.0.0.1", 6173, is_master, nranks) + pg_group = core.ProcessGroupHCCL(store, rank, nranks) + + return pg_group + + +class TestProcessGroupFp32(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float32" + self.shape = (2, 10, 5) + + def test_create_process_group_nccl(self): + with _test_eager_guard(): + paddle.set_device('npu:%d' % + paddle.distributed.ParallelEnv().dev_id) + + pg = init_process_group() + + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.allreduce(tensor_x) + task.wait() + assert np.array_equal(tensor_x, sum_result) + else: + task = pg.allreduce(tensor_y) + task.wait() + assert np.array_equal(tensor_y, sum_result) + + print("test allreduce sum api ok") + + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = pg.allreduce(tensor_x, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = pg.allreduce(tensor_y, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_y, max_result) + + print("test allreduce max api ok") + + # test broadcast + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if pg.rank() == 0: + task = pg.broadcast(tensor_x, 0) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_x) + else: + task = pg.broadcast(tensor_y, 0) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_y) + + print("test broadcast api ok") + + # test barrier + # rank 0 + if pg.rank() == 0: + task = pg.barrier() + task.wait() + # rank 1 + else: + task = pg.barrier() + task.wait() + + print("test barrier api ok\n") + exit(0) + + # test allgather + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + out_shape = list(self.shape) + out_shape[0] *= 2 + out = np.random.random(out_shape).astype(self.dtype) + tensor_out = paddle.to_tensor(out) + if pg.rank() == 0: + task = pg.all_gather(tensor_x, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.all_gather(tensor_y, tensor_out) + task.wait() + paddle.device.cuda.synchronize() + out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) + out_2 = 
paddle.slice(tensor_out, [0], [out_shape[0] // 2], + [out_shape[0]]) + assert np.array_equal(tensor_x, out_1) + assert np.array_equal(tensor_y, out_2) + print("test allgather api ok\n") + + # test alltoall + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + out1 = np.random.random(self.shape).astype(self.dtype) + out2 = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + tensor_out1 = paddle.to_tensor(out1) + tensor_out2 = paddle.to_tensor(out2) + raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2], + [self.shape[0]]) + raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0], + [self.shape[0] // 2]) + if pg.rank() == 0: + task = pg.alltoall(tensor_x, tensor_out1) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.alltoall(tensor_y, tensor_out2) + task.wait() + paddle.device.cuda.synchronize() + out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2], + [self.shape[0]]) + out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2]) + if pg.rank() == 0: + assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy()) + else: + assert np.array_equal(out2_1, raw_tensor_x_2) + print("test alltoall api ok\n") + + # test Reduce + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.reduce(tensor_x, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.reduce(tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + if pg.rank() == 0: + assert np.array_equal(tensor_x, sum_result) + print("test reduce sum api ok\n") + + # test Scatter + # rank 0 + in_shape = list(self.shape) + in_shape[0] *= 2 + x = np.random.random(in_shape).astype(self.dtype) + y = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + tensor_y = paddle.to_tensor(y) + if pg.rank() == 0: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + # rank 1 + else: + task = pg.scatter(tensor_x, tensor_y, 0) + task.wait() + paddle.device.cuda.synchronize() + out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) + out2 = paddle.slice(tensor_x, [0], [self.shape[0]], + [self.shape[0] * 2]) + if pg.rank() == 0: + assert np.array_equal(tensor_y, out1) + else: + assert np.array_equal(tensor_y, out2) + print("test scatter api ok\n") + + +class TestProcessGroupFp16(TestProcessGroupFp32): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float16" + self.shape = (4, 20, 20) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py new file mode 100644 index 0000000000000..9b2c6fae15eb4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_collective_process_group_hccl.py @@ -0,0 +1,29 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import sys
+sys.path.append("..")
+from test_parallel_dygraph_dataparallel import TestMultipleGpus
+
+
+class TestProcessGroup(TestMultipleGpus):
+    def test_process_group_hccl(self):
+        self.run_mnist_2gpu('process_group_hccl.py')
+
+
+if __name__ == "__main__":
+    unittest.main()

From 00566eade8749566763af7e782224f3fed68bbdf Mon Sep 17 00:00:00 2001
From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com>
Date: Tue, 8 Mar 2022 16:47:20 +0800
Subject: [PATCH 29/50] Add exception throw for norm_conv when platform is not
 supported (#40166)

* Add throw for norm_conv when platform is not supported

* fix format
---
 .../operators/fused/cudnn_norm_conv_test.cc   | 42 ++++++++++++++++---
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
index b3792a176fabe..a80f590aa495d 100644
--- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
+++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc
@@ -405,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) {
   CudnnNormConvolutionTester<paddle::platform::float16, float> test(
       batch_size, height, width, input_channels, output_channels, kernel_size,
       stride);
-  test.CheckForward(1e-3, true);
-  test.CheckBackward(1e-3, true);
+  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+
+  if (ctx->GetComputeCapability() <= 70) {
+    ASSERT_THROW(test.CheckForward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+    ASSERT_THROW(test.CheckBackward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+  } else {
+    ASSERT_NO_THROW(test.CheckForward(1e-3, true));
+    ASSERT_NO_THROW(test.CheckBackward(1e-3, true));
+  }
 }
 
 // test for fp16, kernel = 3, output_channels = input_channels
@@ -421,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) {
   CudnnNormConvolutionTester<paddle::platform::float16, float> test(
       batch_size, height, width, input_channels, output_channels, kernel_size,
       stride);
-  test.CheckForward(1e-3, true);
-  test.CheckBackward(1e-3, true);
+  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+
+  if (ctx->GetComputeCapability() <= 70) {
+    ASSERT_THROW(test.CheckForward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+    ASSERT_THROW(test.CheckBackward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+  } else {
+    ASSERT_NO_THROW(test.CheckForward(1e-3, true));
+    ASSERT_NO_THROW(test.CheckBackward(1e-3, true));
+  }
 }
 
 // test for fp16, kernel = 1, output_channels = input_channels * 4
@@ -437,8 +457,18 @@ TEST(CudnnNormConvFp16, K1S1O4) {
   CudnnNormConvolutionTester<paddle::platform::float16, float> test(
       batch_size, height, width, input_channels, output_channels, kernel_size,
       stride);
-  test.CheckForward(1e-3, true);
-  test.CheckBackward(1e-3, true);
+  platform::CUDADeviceContext *ctx = static_cast<platform::CUDADeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)));
+
+  if (ctx->GetComputeCapability() <= 70) {
+    ASSERT_THROW(test.CheckForward(1e-3, true),
+                 paddle::platform::EnforceNotMet);
+    ASSERT_THROW(test.CheckBackward(1e-3, true),
+
paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 From 48b4366c707ab570d7012e213d3eccef73ac40a4 Mon Sep 17 00:00:00 2001 From: Yang <3349368+m3ngyang@users.noreply.github.com> Date: Tue, 8 Mar 2022 16:51:44 +0800 Subject: [PATCH 30/50] [Phi] move ops: maxout/take_along_axis/put_along_axis (#39959) * [Phi] move put_along_axis/take_along_axis/maxout * use phi::Copy --- paddle/fluid/operators/math/maxouting.cc | 151 +++++++++--------- paddle/fluid/operators/math/maxouting.cu | 107 +++++++------ paddle/fluid/operators/math/maxouting.h | 2 +- paddle/fluid/operators/maxout_op.cc | 13 +- paddle/fluid/operators/maxout_op.cu.cc | 24 --- paddle/fluid/operators/maxout_op.h | 72 --------- paddle/fluid/operators/put_along_axis_op.cc | 16 +- paddle/fluid/operators/put_along_axis_op.cu | 134 ---------------- paddle/fluid/operators/put_along_axis_op.h | 124 -------------- paddle/fluid/operators/take_along_axis_op.cc | 16 +- paddle/fluid/operators/take_along_axis_op.cu | 97 ----------- paddle/fluid/operators/take_along_axis_op.h | 92 ----------- paddle/phi/kernels/CMakeLists.txt | 8 +- paddle/phi/kernels/cpu/maxout_grad_kernel.cc | 20 +++ paddle/phi/kernels/cpu/maxout_kernel.cc | 19 +++ .../kernels/cpu/put_along_axis_grad_kernel.cc | 83 ++++++++++ .../phi/kernels/cpu/put_along_axis_kernel.cc | 87 ++++++++++ .../cpu/take_along_axis_grad_kernel.cc | 71 ++++++++ .../phi/kernels/cpu/take_along_axis_kernel.cc | 60 +++++++ paddle/phi/kernels/gpu/maxout_grad_kernel.cu | 20 +++ paddle/phi/kernels/gpu/maxout_kernel.cu | 19 +++ .../kernels/gpu/put_along_axis_grad_kernel.cu | 79 +++++++++ .../phi/kernels/gpu/put_along_axis_kernel.cu | 86 ++++++++++ .../gpu/take_along_axis_grad_kernel.cu | 72 +++++++++ .../phi/kernels/gpu/take_along_axis_kernel.cu | 59 +++++++ .../kernels/impl/maxout_grad_kernel_impl.h | 45 ++++++ paddle/phi/kernels/impl/maxout_kernel_impl.h | 37 +++++ paddle/phi/kernels/maxout_grad_kernel.h | 30 ++++ paddle/phi/kernels/maxout_kernel.h | 28 ++++ .../phi/kernels/put_along_axis_grad_kernel.h | 33 ++++ paddle/phi/kernels/put_along_axis_kernel.h | 32 ++++ .../phi/kernels/take_along_axis_grad_kernel.h | 29 ++++ paddle/phi/kernels/take_along_axis_kernel.h | 28 ++++ paddle/phi/ops/compat/maxout_sig.cc | 33 ++++ paddle/phi/ops/compat/put_along_axis_sig.cc | 38 +++++ paddle/phi/ops/compat/take_along_axis_sig.cc | 37 +++++ 36 files changed, 1191 insertions(+), 710 deletions(-) delete mode 100644 paddle/fluid/operators/maxout_op.cu.cc delete mode 100644 paddle/fluid/operators/maxout_op.h delete mode 100644 paddle/fluid/operators/put_along_axis_op.cu delete mode 100644 paddle/fluid/operators/put_along_axis_op.h delete mode 100644 paddle/fluid/operators/take_along_axis_op.cu delete mode 100644 paddle/fluid/operators/take_along_axis_op.h create mode 100644 paddle/phi/kernels/cpu/maxout_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/maxout_kernel.cc create mode 100644 paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/put_along_axis_kernel.cc create mode 100644 paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/take_along_axis_kernel.cc create mode 100644 paddle/phi/kernels/gpu/maxout_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/maxout_kernel.cu create mode 100644 paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu create mode 
100644 paddle/phi/kernels/gpu/put_along_axis_kernel.cu create mode 100644 paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/take_along_axis_kernel.cu create mode 100644 paddle/phi/kernels/impl/maxout_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/maxout_kernel_impl.h create mode 100644 paddle/phi/kernels/maxout_grad_kernel.h create mode 100644 paddle/phi/kernels/maxout_kernel.h create mode 100644 paddle/phi/kernels/put_along_axis_grad_kernel.h create mode 100644 paddle/phi/kernels/put_along_axis_kernel.h create mode 100644 paddle/phi/kernels/take_along_axis_grad_kernel.h create mode 100644 paddle/phi/kernels/take_along_axis_kernel.h create mode 100644 paddle/phi/ops/compat/maxout_sig.cc create mode 100644 paddle/phi/ops/compat/put_along_axis_sig.cc create mode 100644 paddle/phi/ops/compat/take_along_axis_sig.cc diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d7a..28ec3a871022f 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? 
fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48c7..1d0478db5ef4a 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - axis, output_data); - } -}; +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + axis, output_data); +} + /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups, axis); - } -}; +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups, axis); +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; @@ -157,6 +154,12 @@ template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 0d8372df8a2fe..1f4964f771542 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -30,7 +30,7 @@ class MaxOutFunctor { const int axis = 1); }; -template +template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777de..e55369e0691ee 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.cu.cc b/paddle/fluid/operators/maxout_op.cu.cc deleted file mode 100644 index be1e81bb869a3..0000000000000 --- a/paddle/fluid/operators/maxout_op.cu.cc +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/maxout_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CUDA_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 922998293943e..0000000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bcae..54e31845ad4bd 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad2c..0000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - 
gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28c9..0000000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915e4..fa8a5e92712ec 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b379..0000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2ad..0000000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 58ea231beef7c..de3b5b53f4640 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,11 +27,17 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel) +set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel triangular_solve_grad_kernel maxout_kernel maxout_grad_kernel put_along_axis_kernel put_along_axis_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel) kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(triangular_solve_grad_kernel DEPS ${COMMON_KERNEL_DEPS} matrix_reduce) +kernel_library(maxout_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(maxout_grad_kernel DEPS ${COMMON_KERNEL_DEPS} maxouting) +kernel_library(put_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(put_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) +kernel_library(take_along_axis_grad_kernel DEPS ${COMMON_KERNEL_DEPS} gather_scatter_kernel) # 4. auto parse and build kernel targets by cmake register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS} ${COMMON_BAISC_KERNELS} ) diff --git a/paddle/phi/kernels/cpu/maxout_grad_kernel.cc b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc new file mode 100644 index 0000000000000..429344a362b1c --- /dev/null +++ b/paddle/phi/kernels/cpu/maxout_grad_kernel.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + maxout_grad, CPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/maxout_kernel.cc b/paddle/phi/kernels/cpu/maxout_kernel.cc new file mode 100644 index 0000000000000..e7cd3ab07ff59 --- /dev/null +++ b/paddle/phi/kernels/cpu/maxout_kernel.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
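+
+// Maxout semantics, for orientation (a sketch, not new behavior): with
+// groups = g, the channel axis is read as blocks of g feature maps and each
+// output element is the max within its block, e.g. for NCHW input x:
+//   out[n, c, h, w] = max_{k in [0, g)} x[n, c * g + k, h, w]
+// so C input channels produce C / g output channels.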
+ +#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(maxout, CPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc new file mode 100644 index 0000000000000..e94d09e0337f2 --- /dev/null +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/put_along_axis_grad_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void PutAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + const std::string& reduce, + DenseTensor* x_grad, + DenseTensor* value_grad) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("PutAlongAxisGradOpKernel only runs on CPU.")); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_scatter_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + out_grad, + axis, + index, + *x_grad, + dev_ctx); + } else { + paddle::operators::cpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, dev_ctx); + } + } + + if (value_grad) { + value_grad->Resize(index.dims()); + value_grad->mutable_data(dev_ctx.GetPlace()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_gather_kernel( + out_grad, axis, index, *value_grad, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_gather_kernel( + out_grad, axis, index, *value_grad, dev_ctx); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(put_along_axis_grad, + CPU, + ALL_LAYOUT, + phi::PutAlongAxisGradKernel, + float, + double, + int, + uint8_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc new file mode 100644 index 0000000000000..83c9a915ee635 --- /dev/null +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/put_along_axis_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& index,
+                        const DenseTensor& value,
+                        int axis,
+                        const std::string& reduce,
+                        DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_cpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("PutAlongAxisOpKernel only runs on CPU."));
+
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+  if (reduce == "add") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_scatter_add_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::cpu_scatter_add_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "multiply" || reduce == "mul") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_scatter_mul_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::cpu_scatter_mul_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "assign") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::cpu_scatter_assign_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::cpu_scatter_assign_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "cannot support reduce: '%s' for scatter kernel, only "
+        "support reduce op: 'add', 'assign', 'mul' and 'multiply', the "
+        "default reduce op is 'assign' ",
+        reduce));
+    return;
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(put_along_axis,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::PutAlongAxisKernel,
+                   float,
+                   double,
+                   int,
+                   uint8_t,
+                   int64_t) {}
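
The semantics being ported here track NumPy's take_along_axis / put_along_axis, and the backward kernels rely on the gather/scatter duality noted in the code comments (the gradient of a gather is a scatter-add). A small NumPy sketch for intuition only; the kernels themselves do not use NumPy:

    import numpy as np

    x = np.array([[10., 30., 20.],
                  [60., 40., 50.]])
    idx = np.argsort(x, axis=1)                     # per-row indices along axis 1
    gathered = np.take_along_axis(x, idx, axis=1)   # gather: each row sorted
    np.put_along_axis(x, idx[:, :1], 0., axis=1)    # scatter: zero each row's minimum

diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
new file mode 100644
index 0000000000000..4443383f40262
--- /dev/null
+++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.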
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/take_along_axis_grad_kernel.h" + +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void TakeAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("This kernel only runs on CPU.")); + + // We need to know the shape of input matrix to determine the shape of grad + // matrix of input. + x_grad->Resize(x.dims()); + dev_ctx.template Alloc(x_grad); + + // Set to zero tensor. + phi::funcs::SetConstant functor; + functor(dev_ctx, x_grad, static_cast(0)); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_scatter_add_kernel( + *x_grad, + axis, + index, + out_grad, + dev_ctx); // the gradient of gather is scatter + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_scatter_add_kernel( + *x_grad, axis, index, out_grad, dev_ctx); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(take_along_axis_grad, + CPU, + ALL_LAYOUT, + phi::TakeAlongAxisGradKernel, + float, + double, + int, + uint8_t, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc new file mode 100644 index 0000000000000..502db8a22da0b --- /dev/null +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/take_along_axis_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void TakeAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + int axis, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_cpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("This kernel only runs on CPU.")); + + out->Resize(index.dims()); + dev_ctx.template Alloc(out); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::cpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::cpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(take_along_axis, + CPU, + ALL_LAYOUT, + phi::TakeAlongAxisKernel, + float, + double, + int, + uint8_t, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/maxout_grad_kernel.cu b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu new file mode 100644 index 0000000000000..86ff09fd74b06 --- /dev/null +++ b/paddle/phi/kernels/gpu/maxout_grad_kernel.cu @@ -0,0 +1,20 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/maxout_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + maxout_grad, GPU, ALL_LAYOUT, phi::MaxOutGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/maxout_kernel.cu b/paddle/phi/kernels/gpu/maxout_kernel.cu new file mode 100644 index 0000000000000..88776a49f19b2 --- /dev/null +++ b/paddle/phi/kernels/gpu/maxout_kernel.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/impl/maxout_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(maxout, GPU, ALL_LAYOUT, phi::MaxOutKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu new file mode 100644 index 0000000000000..f553da361f1fe --- /dev/null +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/put_along_axis_grad_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void PutAlongAxisGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + const DenseTensor& out_grad, + int axis, + const std::string& reduce, + DenseTensor* x_grad, + DenseTensor* value_grad) { + PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet( + "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (x_grad) { + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, dev_ctx); + } else { + paddle::operators::gpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, dev_ctx); + } + } + if (value_grad) { + value_grad->Resize(index.dims()); + value_grad->mutable_data(dev_ctx.GetPlace()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_gather_kernel( + out_grad, + axis, + index, + *value_grad, + dev_ctx); // the gradient of scatter is gather + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_gather_kernel( + out_grad, axis, index, *value_grad, dev_ctx); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(put_along_axis_grad, + GPU, + ALL_LAYOUT, + phi::PutAlongAxisGradKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu new file mode 100644 index 0000000000000..d363c0c28364c --- /dev/null +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/put_along_axis_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& index,
+                        const DenseTensor& value,
+                        int axis,
+                        const std::string& reduce,
+                        DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
+                    true,
+                    errors::PreconditionNotMet(
+                        "PutAlongAxisCUDAKernel only runs on GPU device."));
+
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  if (reduce == "add") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_scatter_add_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::gpu_scatter_add_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "multiply" || reduce == "mul") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_scatter_mul_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::gpu_scatter_mul_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else if (reduce == "assign") {
+    if (index_type == paddle::framework::proto::VarType::INT32) {
+      paddle::operators::gpu_scatter_assign_kernel<T, int32_t>(
+          *out, axis, index, value, dev_ctx);
+    } else if (index_type == paddle::framework::proto::VarType::INT64) {
+      paddle::operators::gpu_scatter_assign_kernel<T, int64_t>(
+          *out, axis, index, value, dev_ctx);
+    }
+  } else {
+    PADDLE_THROW(errors::InvalidArgument(
+        "cannot support reduce: '%s' for scatter kernel; only "
+        "reduce ops 'add', 'assign', 'mul' and 'multiply' are supported, "
+        "and the default reduce op is 'assign'",
+        reduce));
+    return;
+  }
+}
+} // namespace phi
+
+PD_REGISTER_KERNEL(put_along_axis,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PutAlongAxisKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
new file mode 100644
index 0000000000000..e09cfd370a4f0
--- /dev/null
+++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu
@@ -0,0 +1,72 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/take_along_axis_grad_kernel.h"
+
+#include "paddle/fluid/framework/convert_utils.h"
+#include "paddle/fluid/operators/gather_scatter_kernel.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& index,
+                             const DenseTensor& out_grad,
+                             int axis,
+                             DenseTensor* x_grad) {
+  PADDLE_ENFORCE_EQ(
+      paddle::platform::is_gpu_place(dev_ctx.GetPlace()),
+      true,
+      errors::PreconditionNotMet("This kernel only runs on GPU."));
+
+  // We need to know the shape of input matrix to determine the shape of grad
+  // matrix of input.
+  x_grad->Resize(x.dims());
+  dev_ctx.template Alloc<T>(x_grad);
+
+  // Set to zero tensor.
+  phi::funcs::SetConstant<Context, T> functor;
+  functor(dev_ctx, x_grad, static_cast<T>(0));
+  const auto& index_type =
+      paddle::framework::TransToProtoVarType(index.dtype());
+
+  if (index_type == paddle::framework::proto::VarType::INT32) {
+    paddle::operators::gpu_scatter_add_kernel<T, int32_t>(
+        *x_grad,
+        axis,
+        index,
+        out_grad,
+        dev_ctx); // the gradient of gather is scatter
+  } else if (index_type == paddle::framework::proto::VarType::INT64) {
+    paddle::operators::gpu_scatter_add_kernel<T, int64_t>(
+        *x_grad, axis, index, out_grad, dev_ctx);
+  }
+}
+
+} // namespace phi
+
+PD_REGISTER_KERNEL(take_along_axis_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TakeAlongAxisGradKernel,
+                   float,
+                   double,
+                   int64_t,
+                   int,
+                   phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
new file mode 100644
index 0000000000000..63113e3e672f3
--- /dev/null
+++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/kernels/take_along_axis_kernel.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/operators/gather_scatter_kernel.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void TakeAlongAxisKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& index, + int axis, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + paddle::platform::is_gpu_place(dev_ctx.GetPlace()), + true, + errors::PreconditionNotMet("This kernel only runs on GPU device.")); + + out->Resize(index.dims()); + dev_ctx.template Alloc(out); + + const auto& index_type = + paddle::framework::TransToProtoVarType(index.dtype()); + if (index_type == paddle::framework::proto::VarType::INT32) { + paddle::operators::gpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } else if (index_type == paddle::framework::proto::VarType::INT64) { + paddle::operators::gpu_gather_kernel( + x, axis, index, *out, dev_ctx); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(take_along_axis, + GPU, + ALL_LAYOUT, + phi::TakeAlongAxisKernel, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h new file mode 100644 index 0000000000000..546ea74674281 --- /dev/null +++ b/paddle/phi/kernels/impl/maxout_grad_kernel_impl.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/maxout_grad_kernel.h" + +#include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void MaxOutGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + int groups, + int axis, + DenseTensor* x_grad) { + if (axis < 0) { + axis += x.dims().size(); + } + + phi::funcs::SetConstant zero; + if (x_grad) { + dev_ctx.template Alloc(x_grad); + zero(dev_ctx, x_grad, static_cast(0.0)); + paddle::operators::math::MaxOutGradFunctor maxout_backward; + maxout_backward(dev_ctx, x, x_grad, out, out_grad, groups, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/maxout_kernel_impl.h b/paddle/phi/kernels/impl/maxout_kernel_impl.h new file mode 100644 index 0000000000000..da8c259ebf217 --- /dev/null +++ b/paddle/phi/kernels/impl/maxout_kernel_impl.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/kernels/maxout_kernel.h"
+
+#include "paddle/fluid/operators/math/maxouting.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxOutKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int groups,
+                  int axis,
+                  DenseTensor* out) {
+  if (axis < 0) {
+    axis += x.dims().size();
+  }
+
+  paddle::operators::math::MaxOutFunctor<Context, T> maxout_forward;
+  maxout_forward(dev_ctx, x, out, groups, axis);
+}
+
+} // namespace phi
diff --git a/paddle/phi/kernels/maxout_grad_kernel.h b/paddle/phi/kernels/maxout_grad_kernel.h
new file mode 100644
index 0000000000000..1ee4e8cc89676
--- /dev/null
+++ b/paddle/phi/kernels/maxout_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxOutGradKernel(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& out,
+                      const DenseTensor& out_grad,
+                      int groups,
+                      int axis,
+                      DenseTensor* x_grad);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/maxout_kernel.h b/paddle/phi/kernels/maxout_kernel.h
new file mode 100644
index 0000000000000..e582575678d4d
--- /dev/null
+++ b/paddle/phi/kernels/maxout_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxOutKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  int groups,
+                  int axis,
+                  DenseTensor* out);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/put_along_axis_grad_kernel.h b/paddle/phi/kernels/put_along_axis_grad_kernel.h
new file mode 100644
index 0000000000000..2141443da7ab1
--- /dev/null
+++ b/paddle/phi/kernels/put_along_axis_grad_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisGradKernel(const Context& dev_ctx,
+                            const DenseTensor& x,
+                            const DenseTensor& index,
+                            const DenseTensor& out_grad,
+                            int axis,
+                            const std::string& reduce,
+                            DenseTensor* x_grad,
+                            DenseTensor* value_grad);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/put_along_axis_kernel.h b/paddle/phi/kernels/put_along_axis_kernel.h
new file mode 100644
index 0000000000000..797d0e364b48d
--- /dev/null
+++ b/paddle/phi/kernels/put_along_axis_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PutAlongAxisKernel(const Context& dev_ctx,
+                        const DenseTensor& x,
+                        const DenseTensor& index,
+                        const DenseTensor& value,
+                        int axis,
+                        const std::string& reduce,
+                        DenseTensor* out);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/take_along_axis_grad_kernel.h b/paddle/phi/kernels/take_along_axis_grad_kernel.h
new file mode 100644
index 0000000000000..a312c235f66fc
--- /dev/null
+++ b/paddle/phi/kernels/take_along_axis_grad_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisGradKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const DenseTensor& index,
+                             const DenseTensor& out_grad,
+                             int axis,
+                             DenseTensor* x_grad);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/take_along_axis_kernel.h b/paddle/phi/kernels/take_along_axis_kernel.h
new file mode 100644
index 0000000000000..e8fb78556d9bb
--- /dev/null
+++ b/paddle/phi/kernels/take_along_axis_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TakeAlongAxisKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& index,
+                         int axis,
+                         DenseTensor* out);
+
+} // namespace phi
diff --git a/paddle/phi/ops/compat/maxout_sig.cc b/paddle/phi/ops/compat/maxout_sig.cc
new file mode 100644
index 0000000000000..d16dd1c8617fe
--- /dev/null
+++ b/paddle/phi/ops/compat/maxout_sig.cc
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature MaxoutArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("maxout", {"X"}, {"groups", "axis"}, {"Out"});
+}
+
+KernelSignature MaxoutGradArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("maxout_grad",
+                         {"X", "Out", GradVarName("Out")},
+                         {"groups", "axis"},
+                         {GradVarName("X")});
+}
+
+} // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(maxout, phi::MaxoutArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(maxout_grad, phi::MaxoutGradArgumentMapping);
diff --git a/paddle/phi/ops/compat/put_along_axis_sig.cc b/paddle/phi/ops/compat/put_along_axis_sig.cc
new file mode 100644
index 0000000000000..5f8dc1cf4cd71
--- /dev/null
+++ b/paddle/phi/ops/compat/put_along_axis_sig.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature PutAlongAxisArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("put_along_axis",
+                         {"Input", "Index", "Value"},
+                         {"Axis", "Reduce"},
+                         {"Result"});
+}
+
+KernelSignature PutAlongAxisGradArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("put_along_axis_grad",
+                         {"Input", "Index", GradVarName("Result")},
+                         {"Axis", "Reduce"},
+                         {GradVarName("Input"), GradVarName("Value")});
+}
+
+} // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(put_along_axis, phi::PutAlongAxisArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(put_along_axis_grad,
+                           phi::PutAlongAxisGradArgumentMapping);
diff --git a/paddle/phi/ops/compat/take_along_axis_sig.cc b/paddle/phi/ops/compat/take_along_axis_sig.cc
new file mode 100644
index 0000000000000..27a996a270ddf
--- /dev/null
+++ b/paddle/phi/ops/compat/take_along_axis_sig.cc
@@ -0,0 +1,37 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature TakeAlongAxisArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"});
+}
+
+KernelSignature TakeAlongAxisGradArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("take_along_axis_grad",
+                         {"Input", "Index", GradVarName("Result")},
+                         {"Axis"},
+                         {GradVarName("Input")});
+}
+
+} // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad,
+                           phi::TakeAlongAxisGradArgumentMapping);
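The gather/scatter semantics behind the kernels migrated in this patch have a compact NumPy analogue. The sketch below is illustrative only and is not part of the patch: the *_ref names are hypothetical, and the add/mul branches take a shortcut that does not accumulate duplicate indices the way the real scatter kernels do.

import numpy as np

def take_along_axis_ref(x, index, axis):
    # gather: pick one element of x along `axis` per index entry
    return np.take_along_axis(x, index, axis)

def put_along_axis_ref(x, index, value, axis, reduce="assign"):
    # scatter: write `value` into a copy of x along `axis`
    out = np.copy(x)
    current = np.take_along_axis(out, index, axis)
    if reduce == "assign":
        np.put_along_axis(out, index, value, axis)
    elif reduce == "add":
        np.put_along_axis(out, index, current + value, axis)
    elif reduce in ("mul", "multiply"):
        np.put_along_axis(out, index, current * value, axis)
    else:
        raise ValueError("unsupported reduce op: %s" % reduce)
    return out

x = np.arange(6, dtype=np.float32).reshape(2, 3)
idx = np.array([[0, 1, 0]])
print(take_along_axis_ref(x, idx, 0))  # [[0. 4. 2.]]
print(put_along_axis_ref(x, idx, np.ones((1, 3), dtype=np.float32), 0, "add"))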
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature TakeAlongAxisArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"}); +} + +KernelSignature TakeAlongAxisGradArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("take_along_axis_grad", + {"Input", "Index", GradVarName("Result")}, + {"Axis"}, + {GradVarName("Input")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad, + phi::TakeAlongAxisGradArgumentMapping); From d4a4eb9d68d1d6ca8025fefbfee1dfb98a9170d0 Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Tue, 8 Mar 2022 17:05:50 +0800 Subject: [PATCH 31/50] Fix fold python examples (#38636) * fix fold python examples, test=develop * fix size type, test=develop * fix python example, test=develop * fix fold shape check * fix fold dygraph mode, test=develop --- paddle/fluid/operators/fold_op.cc | 22 +++++- .../fluid/tests/unittests/test_fold_op.py | 10 +++ python/paddle/nn/functional/common.py | 68 +++++++++++-------- python/paddle/nn/layer/common.py | 15 ++-- 4 files changed, 76 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190ff..92f59e118c3b7 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py index 14a59b413383f..44b94cd3b66ee 100644 --- a/python/paddle/fluid/tests/unittests/test_fold_op.py +++ b/python/paddle/fluid/tests/unittests/test_fold_op.py @@ -174,6 +174,15 @@ def test_output_size(): x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1]) + def test_output_size_2(): + # 
diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py
index 14a59b413383f..44b94cd3b66ee 100644
--- a/python/paddle/fluid/tests/unittests/test_fold_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fold_op.py
@@ -174,6 +174,15 @@ def test_output_size():
                 x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1])
 
+        def test_output_size_2():
+            # out_size must be GT 1
+            x = paddle.randn(shape=[2, 6, 6], dtype="float32")
+            out = fold(
+                x,
+                output_sizes=[0.1, 0.2],
+                kernel_sizes=[2, 2],
+                strides=[1, 1])
+
         def test_block_h_w():
             # test_block_h_w GT 0
             x = paddle.randn(shape=[2, 1, 1], dtype="float32")
@@ -196,6 +205,7 @@ def test_GT_0():
         self.assertRaises(AssertionError, test_dilations_shape)
         self.assertRaises(AssertionError, test_strides_shape)
         self.assertRaises(ValueError, test_output_size)
+        self.assertRaises(ValueError, test_output_size_2)
         self.assertRaises(ValueError, test_block_h_w)
         self.assertRaises(ValueError, test_GT_0)
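With the relaxed argument checks in the common.py changes below, tuples work anywhere lists were required before. A usage sketch, assuming a Paddle build that contains this patch:

import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 3 * 2 * 2, 12])
y = F.fold(x, output_sizes=(4, 5), kernel_sizes=(2, 2),
           strides=(1, 1), paddings=(0, 0, 0, 0), dilations=(1, 1))
print(y.shape)  # [2, 3, 4, 5]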
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index ed668ed124c23..9e78ca6be3f27 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -351,7 +351,6 @@ def _is_list_or_turple_(data):
         out_shape = size
         scale = scale_factor
-
     if out_shape is not None and scale is not None:
         raise ValueError("Only one of size or scale_factor should be defined.")
     if out_shape is not None:
@@ -362,6 +361,8 @@ def _is_list_or_turple_(data):
         if in_dynamic_mode():
             if isinstance(out_shape, Variable):
                 out_shape = list(out_shape.numpy())
+            else:
+                out_shape = list(out_shape)
             for i, dim in enumerate(out_shape):
                 if isinstance(dim, Variable):
                     out_shape[i] = dim.numpy()[0]
@@ -1818,7 +1819,6 @@ def fold(x,
     can be calculated as follows.
 
     .. math::
-
         H_out &= output_size[0]
         W_out &= output_size[1]
         C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1]
@@ -1826,21 +1826,21 @@ def fold(x,
 
     Parameters:
         x(Tensor): 3-D Tensor, input tensor of format [N, C, L],
            data type can be float32 or float64
-        output_sizes(list): The size of the output, should be [output_size_h, output_size_w]
+        output_sizes(int|list|tuple): The size of the output, should be [output_size_h, output_size_w]
            or an integer o treated as [o, o].
-        kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w]
+        kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w]
            or an integer k treated as [k, k].
-        strides(int|list): The strides, should be [stride_h, stride_w]
+        strides(int|list|tuple): The strides, should be [stride_h, stride_w]
            or an integer stride treated as [stride, stride].
            By default, strides will be [1, 1].
-        paddings(int|list): The paddings of each dimension, should be
+        paddings(int|list|tuple): The paddings of each dimension, should be
            [padding_top, padding_left, padding_bottom, padding_right]
            or [padding_h, padding_w] or an integer padding.
            If [padding_h, padding_w] was given, it will be expanded to
            [padding_h, padding_w, padding_h, padding_w]. If an integer
            padding was given, [padding, padding, padding, padding] will
            be used. By default, paddings will be [0, 0, 0, 0]
-        dilations(int|list): the dilations of convolution kernel, should be
+        dilations(int|list|tuple): the dilations of convolution kernel, should be
            [dilation_h, dilation_w], or an integer dilation treated as
            [dilation, dilation]. By default, it will be [1, 1].
         name(str, optional): The default value is None.
@@ -1859,9 +1859,9 @@ def fold(x,
             import paddle
             import paddle.nn.functional as F
 
-            x = paddle.randn([2,12,9])
-            y = F.fold(x, output_sizes=(4, 4), kernel_sizes=2)
-            # y.shape = [2,3,4,4]
+            x = paddle.randn([2,3*2*2,12])
+            y = F.fold(x, output_sizes=[4, 5], kernel_sizes=2)
+            # y.shape = [2,3,4,5]
 
     """
 
@@ -1872,29 +1872,32 @@ def fold(x,
     assert len(x.shape) == 3, \
             "input should be the format of [N, C, L]"
 
+    def _is_list_or_turple_(data):
+        return (isinstance(data, list) or isinstance(data, tuple))
+
     if isinstance(output_sizes, int):
         output_sizes = [output_sizes, output_sizes]
     else:
-        assert isinstance(output_sizes, list) and (len(output_sizes) == 2), \
-            "output_sizes should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(output_sizes) and (len(output_sizes) == 2), \
+            "output_sizes should either be an integer or a list/tuple of two integers"
 
     if isinstance(kernel_sizes, int):
         kernel_sizes = [kernel_sizes, kernel_sizes]
     else:
-        assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \
-            "kernel_sizes should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(kernel_sizes) and (len(kernel_sizes) == 2), \
+            "kernel_sizes should either be an integer or a list/tuple of two integers"
 
     if isinstance(strides, int):
         strides = [strides, strides]
     else:
-        assert isinstance(strides, list) and (len(strides) == 2), \
-            "strides should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(strides) and (len(strides) == 2), \
+            "strides should either be an integer or a list/tuple of two integers"
 
     if isinstance(dilations, int):
         dilations = [dilations, dilations]
     else:
-        assert isinstance(dilations, list) and (len(dilations) == 2), \
-            "dilations should either be an integer or a list of two integers"
+        assert _is_list_or_turple_(dilations) and (len(dilations) == 2), \
+            "dilations should either be an integer or a list/tuple of two integers"
 
     if isinstance(paddings, int):
         paddings = [paddings] * 4
@@ -1912,16 +1915,21 @@ def fold(x,
            "Unexpected type of paddings, it should be either an integer or a list "
            "of 2 or 4 integers")
 
-    out = helper.create_variable_for_type_inference(dtype=x.dtype)
-    helper.append_op(
-        type="fold",
-        inputs={"X": x},
-        outputs={"Y": out},
-        attrs={
-            "output_sizes": output_sizes,
-            "kernel_sizes": kernel_sizes,
-            "strides": strides,
-            "paddings": paddings,
-            "dilations": dilations
-        })
+    if in_dynamic_mode():
+        out = _C_ops.fold(x, "output_sizes", output_sizes, "kernel_sizes",
+                          kernel_sizes, "strides", strides, "paddings",
+                          paddings, "dilations", dilations)
+    else:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+        helper.append_op(
+            type="fold",
+            inputs={"X": x},
+            outputs={"Y": out},
+            attrs={
+                "output_sizes": output_sizes,
+                "kernel_sizes": kernel_sizes,
+                "strides": strides,
+                "paddings": paddings,
+                "dilations": dilations
+            })
     return out
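fold also composes with unfold in the expected way: overlapping patches are summed on the way back, so every element returns scaled by the number of patches covering it. A sketch, again assuming a Paddle build with this patch applied and the current unfold API:

import paddle
import paddle.nn.functional as F

img = paddle.ones([1, 3, 4, 5])
patches = F.unfold(img, kernel_sizes=2, strides=1)            # [1, 12, 12]
recon = F.fold(patches, output_sizes=[4, 5], kernel_sizes=2)  # [1, 3, 4, 5]
print(recon[0, 0])  # corners 1.0, edges 2.0, interior 4.0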
diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py
index 19fbcd5b6f856..dac4cf5f27253 100644
--- a/python/paddle/nn/layer/common.py
+++ b/python/paddle/nn/layer/common.py
@@ -1565,7 +1565,6 @@ class Fold(Layer):
     can be calculated as follows.
 
     .. math::
-
         H_out &= output_size[0]
         W_out &= output_size[1]
         C_out &= C_in / kernel\_sizes[0] / kernel\_sizes[1]
@@ -1573,19 +1572,19 @@ class Fold(Layer):
 
     Parameters:
         output_sizes(list): The size of the output, should be [output_size_h, output_size_w]
            or an integer o treated as [o, o].
-        kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w]
+        kernel_sizes(int|list|tuple): The size of convolution kernel, should be [k_h, k_w]
            or an integer k treated as [k, k].
-        strides(int|list): The strides, should be [stride_h, stride_w]
+        strides(int|list|tuple): The strides, should be [stride_h, stride_w]
            or an integer stride treated as [stride, stride].
            By default, strides will be [1, 1].
-        paddings(int|list): The paddings of each dimension, should be
+        paddings(int|list|tuple): The paddings of each dimension, should be
            [padding_top, padding_left, padding_bottom, padding_right]
            or [padding_h, padding_w] or an integer padding.
            If [padding_h, padding_w] was given, it will be expanded to
            [padding_h, padding_w, padding_h, padding_w]. If an integer
            padding was given, [padding, padding, padding, padding] will
            be used. By default, paddings will be [0, 0, 0, 0]
-        dilations(int|list): the dilations of convolution kernel, should be
+        dilations(int|list|tuple): the dilations of convolution kernel, should be
            [dilation_h, dilation_w], or an integer dilation treated as
            [dilation, dilation]. By default, it will be [1, 1].
         name(str, optional): The default value is None.
@@ -1604,10 +1603,10 @@ class Fold(Layer):
             import paddle
             import paddle.nn as nn
 
-            x = paddle.randn([2,12,9])
-            fold = nn.Fold(output_sizes=(4, 4), kernel_sizes=2)
+            x = paddle.randn([2,3*2*2,12])
+            fold = nn.Fold(output_sizes=[4, 5], kernel_sizes=2)
             y = fold(x)
-            # y.shape = [2,3,4,4]
+            # y.shape = [2,3,4,5]
     """
 
     def __init__(self,

From 2ce007cae0a2307997d8ffc43292fd505246e36b Mon Sep 17 00:00:00 2001
From: Kaipeng Deng
Date: Tue, 8 Mar 2022 17:41:29 +0800
Subject: [PATCH 32/50] remove isinstance Dataset check. test=develop (#40184)

---
 python/paddle/fluid/dataloader/batch_sampler.py | 2 --
 python/paddle/fluid/reader.py                   | 2 --
 2 files changed, 4 deletions(-)

diff --git a/python/paddle/fluid/dataloader/batch_sampler.py b/python/paddle/fluid/dataloader/batch_sampler.py
index 3debeecfe4f38..3a23c852563da 100644
--- a/python/paddle/fluid/dataloader/batch_sampler.py
+++ b/python/paddle/fluid/dataloader/batch_sampler.py
@@ -113,8 +113,6 @@ def __init__(self,
             assert not shuffle, "shuffle should be False when sampler is set"
             self.sampler = sampler
         else:
-            assert isinstance(dataset, Dataset), \
-                "dataset should be a paddle.io.Dataset"
             assert not isinstance(dataset, IterableDataset), \
                 "dataset should not be a paddle.io.IterableDataset"
             assert sampler is None, \
diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 727ceca72d1f1..cbea289162c84 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -332,8 +332,6 @@ def __init__(self,
         self.use_buffer_reader = use_buffer_reader
         self.worker_init_fn = worker_init_fn
 
-        assert isinstance(dataset, Dataset), \
-            "dataset should be subclass instance of paddle.io.Dataset"
         self.dataset = dataset
 
         if not return_list and not in_dygraph_mode():

From 9aa6bfc7e1cfce657109789995d153b6bcdf74d7 Mon Sep 17 00:00:00 2001
From: Kaipeng Deng
Date: Tue, 8 Mar 2022 17:42:29 +0800
Subject: [PATCH 33/50] fix yolov3 return value in dygraph mode.
test=develop (#40185) --- python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py | 1 + python/paddle/vision/ops.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 1ec1d1527e178..3f0e4f7a4002a 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -305,6 +305,7 @@ def test_dygraph(self): use_label_smooth=True, scale_x_y=1.) assert loss is not None + assert loss.shape == [2] paddle.enable_static() diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 03060e92bdb69..4983ca49ac32f 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -195,7 +195,7 @@ def yolo_loss(x, """ if in_dygraph_mode() and gt_score is None: - loss = _C_ops.yolov3_loss( + loss, _, _ = _C_ops.yolov3_loss( x, gt_box, gt_label, 'anchors', anchors, 'anchor_mask', anchor_mask, 'class_num', class_num, 'ignore_thresh', ignore_thresh, 'downsample_ratio', downsample_ratio, 'use_label_smooth', From 3a77d027b143b19a9c26bdc7e77e0902ff2a7feb Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 8 Mar 2022 20:18:03 +0800 Subject: [PATCH 34/50] [Phi] Remove gpudnn suffix & polish cmake (#40239) * remove gpudnn suffix & polish cmake * fix typo --- cmake/phi.cmake | 122 +++++++++--------- ...nel_gpudnn.cu => conv_grad_grad_kernel.cu} | 0 ...d_kernel_gpudnn.cu => conv_grad_kernel.cu} | 0 .../{conv_kernel_gpudnn.cu => conv_kernel.cu} | 0 ...ernel_gpudnn.cu => softmax_grad_kernel.cu} | 0 ...max_kernel_gpudnn.cu => softmax_kernel.cu} | 0 6 files changed, 59 insertions(+), 63 deletions(-) rename paddle/phi/kernels/gpudnn/{conv_grad_grad_kernel_gpudnn.cu => conv_grad_grad_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{conv_grad_kernel_gpudnn.cu => conv_grad_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{conv_kernel_gpudnn.cu => conv_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{softmax_grad_kernel_gpudnn.cu => softmax_grad_kernel.cu} (100%) rename paddle/phi/kernels/gpudnn/{softmax_kernel_gpudnn.cu => softmax_kernel.cu} (100%) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f6e15758379ad..ebb686d8ad0f3 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -134,8 +134,8 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -197,92 +197,88 @@ function(kernel_library TARGET) # kernel source file level # level 1: base device kernel - # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs # level 2: device-independent kernel # - common_srcs # level 3: Kernel implemented by reusing device-independent kernel # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR 
${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # 1. Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. - elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. 
- elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. 
Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_grad_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/conv_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/conv_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/softmax_grad_kernel.cu diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel.cu similarity index 100% rename from paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu rename to paddle/phi/kernels/gpudnn/softmax_kernel.cu From c1d81ec13cec96729f3902455e1038eb6e6280cf Mon Sep 17 00:00:00 2001 From: chenjian Date: Tue, 8 Mar 2022 21:12:04 +0800 Subject: [PATCH 35/50] Add profiler statistic (#40249) * add python profiler package * update according to review * fix bug * fix bug * fix bug * add unit test * Revert "add unit test" This reverts commit 4e69ff71b0645e069afe5dd8fea0d07717852c48. 
* reduce for pr * add unit test * modify for pr * fix unittest * update for ci coverage * modify according to review * fix bug * improve coverage * add profiler code * add statistic code * reduce content for pr --- .../unittests/test_profiler_statistic.py | 199 +++++ python/paddle/profiler/profiler_statistic.py | 793 ++++++++++++++++++ 2 files changed, 992 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_profiler_statistic.py mode change 100644 => 100755 python/paddle/profiler/profiler_statistic.py diff --git a/python/paddle/fluid/tests/unittests/test_profiler_statistic.py b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py new file mode 100644 index 0000000000000..838ccae37cfa5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_profiler_statistic.py @@ -0,0 +1,199 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.profiler as profiler + + +class HostPythonNode: + def __init__(self, name, type, start_ns, end_ns, process_id, thread_id): + self.name = name + self.type = type + self.start_ns = start_ns + self.end_ns = end_ns + self.process_id = process_id + self.thread_id = thread_id + self.children_node = [] + self.runtime_node = [] + self.device_node = [] + + +class DevicePythonNode: + def __init__(self, name, type, start_ns, end_ns, device_id, context_id, + stream_id): + self.name = name + self.type = type + self.start_ns = start_ns + self.end_ns = end_ns + self.device_id = device_id + self.context_id = context_id + self.stream_id = stream_id + + +class TestProfilerStatistic(unittest.TestCase): + def test_statistic_case1(self): + root_node = HostPythonNode('Root Node', + profiler.TracerEventType.UserDefined, 0, + float('inf'), 1000, 1001) + profilerstep_node = HostPythonNode('ProfileStep#1', + profiler.TracerEventType.ProfileStep, + 0, 400, 1000, 1001) + dataloader_node = HostPythonNode( + 'Dataloader', profiler.TracerEventType.Forward, 5, 15, 1000, 1001) + mobilenet_node = HostPythonNode( + 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) + yolonet_node = HostPythonNode( + 'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001) + backward_node = HostPythonNode('Gradient Backward', + profiler.TracerEventType.Backward, 120, + 200, 1000, 1001) + optimization_node = HostPythonNode( + 'Optimization', profiler.TracerEventType.Optimization, 220, 300, + 1000, 1001) + conv2d_node = HostPythonNode( + 'conv2d', profiler.TracerEventType.Operator, 25, 40, 1000, 1001) + sync_batch_norm_node = HostPythonNode('sync_batch_norm', + profiler.TracerEventType.Operator, + 60, 100, 1000, 1001) + conv2d_infer_shape = HostPythonNode( + 'conv2d::infer_shape', profiler.TracerEventType.OperatorInner, 25, + 30, 1000, 1001) + conv2d_compute = HostPythonNode('conv2d::compute', + profiler.TracerEventType.OperatorInner, + 30, 40, 1000, 1001) + conv2d_launchkernel = HostPythonNode( + 'cudalaunchkernel', 
profiler.TracerEventType.CudaRuntime, 30, 35, + 1000, 1001) + conv2d_MemCpy = HostPythonNode('AsyncMemcpy', + profiler.TracerEventType.UserDefined, 35, + 40, 1000, 1001) + conv2d_cudaMemCpy = HostPythonNode('cudaMemcpy', + profiler.TracerEventType.CudaRuntime, + 35, 40, 1000, 1001) + conv2d_kernel = DevicePythonNode( + 'conv2d_kernel', profiler.TracerEventType.Kernel, 35, 50, 0, 0, 0) + conv2d_memcpy = DevicePythonNode( + 'conv2d_memcpy', profiler.TracerEventType.Memcpy, 50, 60, 0, 0, 0) + sync_batch_norm_infer_shape = HostPythonNode( + 'sync_batch_norm::infer_shape', + profiler.TracerEventType.OperatorInner, 60, 70, 1000, 1001) + sync_batch_norm_compute = HostPythonNode( + 'sync_batch_norm::compute', profiler.TracerEventType.OperatorInner, + 80, 100, 1000, 1001) + sync_batch_norm_launchkernel = HostPythonNode( + 'cudalaunchkernel', profiler.TracerEventType.CudaRuntime, 80, 90, + 1000, 1001) + sync_batch_norm_MemCpy = HostPythonNode( + 'AsyncMemcpy', profiler.TracerEventType.UserDefined, 90, 100, 1000, + 1001) + sync_batch_norm_cudaMemCpy = HostPythonNode( + 'cudaMemcpy', profiler.TracerEventType.CudaRuntime, 90, 100, 1000, + 1001) + sync_batch_norm_kernel = DevicePythonNode( + 'sync_batch_norm_kernel', profiler.TracerEventType.Kernel, 95, 155, + 0, 0, 0) + sync_batch_norm_memcpy = DevicePythonNode( + 'sync_batch_norm_memcpy', profiler.TracerEventType.Memcpy, 150, 200, + 0, 0, 1) + root_node.children_node.append(profilerstep_node) + profilerstep_node.children_node.extend([ + dataloader_node, mobilenet_node, yolonet_node, backward_node, + optimization_node + ]) + mobilenet_node.children_node.append(conv2d_node) + yolonet_node.children_node.append(sync_batch_norm_node) + conv2d_node.children_node.extend( + [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) + conv2d_compute.runtime_node.append(conv2d_launchkernel) + conv2d_MemCpy.runtime_node.append(conv2d_cudaMemCpy) + conv2d_launchkernel.device_node.append(conv2d_kernel) + conv2d_cudaMemCpy.device_node.append(conv2d_memcpy) + sync_batch_norm_node.children_node.extend([ + sync_batch_norm_infer_shape, sync_batch_norm_compute, + sync_batch_norm_MemCpy + ]) + sync_batch_norm_compute.runtime_node.append( + sync_batch_norm_launchkernel) + sync_batch_norm_MemCpy.runtime_node.append(sync_batch_norm_cudaMemCpy) + sync_batch_norm_launchkernel.device_node.append(sync_batch_norm_kernel) + sync_batch_norm_cudaMemCpy.device_node.append(sync_batch_norm_memcpy) + thread_tree = {'thread1001': root_node} + extra_info = { + 'Process Cpu Utilization': '1.02', + 'System Cpu Utilization': '0.68' + } + statistic_data = profiler.profiler_statistic.StatisticData(thread_tree, + extra_info) + time_range_summary = statistic_data.time_range_summary + event_summary = statistic_data.event_summary + + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.ProfileStep), 400) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Forward), 90) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Backward), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Optimization), 80) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.Operator), 55) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.OperatorInner), 45) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.CudaRuntime), 30) + self.assertEqual( + 
time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Kernel), 75) + self.assertEqual( + time_range_summary.get_gpu_range_sum( + 0, profiler.TracerEventType.Memcpy), 60) + self.assertEqual( + time_range_summary.get_cpu_range_sum( + profiler.TracerEventType.UserDefined), 15) + self.assertEqual(len(event_summary.items), 2) + self.assertEqual(len(event_summary.userdefined_items), 0) + self.assertEqual(len(event_summary.model_perspective_items), 3) + self.assertEqual(len(event_summary.memory_manipulation_items), 1) + self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) + self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) + self.assertEqual( + event_summary.model_perspective_items['Forward'].cpu_time, 90) + self.assertEqual( + event_summary.model_perspective_items['Forward'].gpu_time, 135) + self.assertEqual( + event_summary.model_perspective_items['Backward'].gpu_time, 0) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].cpu_time, 15) + self.assertEqual( + event_summary.memory_manipulation_items['AsyncMemcpy'].gpu_time, 60) + print( + profiler.profiler_statistic._build_table( + statistic_data, + sorted_by=profiler.SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms')) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py old mode 100644 new mode 100755 index 29d586268a014..e39871c7365ba --- a/python/paddle/profiler/profiler_statistic.py +++ b/python/paddle/profiler/profiler_statistic.py @@ -16,6 +16,20 @@ from paddle.fluid.core import TracerEventType +from .statistic_helper import * + +_AllTracerEventType = [ + TracerEventType.Operator, TracerEventType.Dataloader, + TracerEventType.ProfileStep, TracerEventType.CudaRuntime, + TracerEventType.Kernel, TracerEventType.Memcpy, TracerEventType.Memset, + TracerEventType.UserDefined, TracerEventType.OperatorInner, + TracerEventType.Forward, TracerEventType.Backward, + TracerEventType.Optimization, TracerEventType.Communication, + TracerEventType.PythonOp, TracerEventType.PythonUserDefined +] + +_CommunicationOpName = ['reduce', 'broadcast', 'rpc'] + class SortedKeys(Enum): r""" @@ -29,3 +43,782 @@ class SortedKeys(Enum): GPUAvg = 5 GPUMax = 6 GPUMin = 7 + + +class HostStatisticNode: + r''' + Wrap original node for calculating statistic metrics. 
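A note on the metric this wrapper maintains: a node's self CPU time is meant to be its own wall time minus the wall time of its direct children and runtime calls. A minimal standalone sketch of that quantity (hypothetical helper, not part of this patch; spans assumed non-overlapping):

    # Sketch of the self-time quantity tracked by HostStatisticNode below.
    # Assumes the child/runtime spans do not overlap each other.
    def self_cpu_time(start_ns, end_ns, child_spans):
        return (end_ns - start_ns) - sum(e - s for s, e in child_spans)

    assert self_cpu_time(0, 100, [(10, 30), (40, 50)]) == 70  # 100 - 30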
+ ''' + + def __init__(self, hostnode): + self.hostnode = hostnode + self.children_node = [] + self.runtime_node = [] + self.cpu_time = 0 + self.self_cpu_time = 0 + self.gpu_time = 0 + self.self_gpu_time = 0 + + def cal_statistic(self): + for child in self.children_node: + child.cal_statistic() + for rt in self.runtime_node: + rt.cal_statistic() + + self.cpu_time = self.hostnode.end_ns - self.hostnode.start_ns + for child in self.children_node: + self.gpu_time += child.gpu_time + self.self_cpu_time -= (child.end_ns - child.start_ns) + for rt in self.runtime_node: + self.self_cpu_time -= (rt.end_ns - rt.start_ns) + self.gpu_time += rt.gpu_time + self.self_gpu_time += rt.gpu_time + for device in self.hostnode.device_node: + self.gpu_time += (device.end_ns - device.start_ns) + self.self_gpu_time += (device.end_ns - device.start_ns) + + @property + def end_ns(self): + return self.hostnode.end_ns + + @property + def start_ns(self): + return self.hostnode.start_ns + + def __getattr__(self, name): + return getattr(self.hostnode, name) + + +def traverse_tree(nodetrees): + results = collections.defaultdict(list) + for thread_id, rootnode in nodetrees.items(): + stack = [] + stack.append(rootnode) + threadlist = results[thread_id] + while stack: + current_node = stack.pop() + threadlist.append(current_node) + for childnode in current_node.children_node: + stack.append(childnode) + return results + + +def wrap_tree(nodetrees): + ''' + Use HostStatisticNode to wrap the original profiler result tree and calculate node statistic metrics. + ''' + node_statistic_tree = {} + results = collections.defaultdict(list) + newresults = collections.defaultdict(list) + for thread_id, rootnode in nodetrees.items(): + stack = [] + stack.append(rootnode) + root_statistic_node = HostStatisticNode(rootnode) + newstack = [] + newstack.append(root_statistic_node) + node_statistic_tree[thread_id] = root_statistic_node + threadlist = results[thread_id] + newthreadlist = newresults[thread_id] + while stack: + current_node = stack.pop() + threadlist.append(current_node) + current_statistic_node = newstack.pop() + newthreadlist.append(current_statistic_node) + for childnode in current_node.children_node: + stack.append(childnode) + child_statistic_node = HostStatisticNode(childnode) + current_statistic_node.children_node.append( + child_statistic_node) + newstack.append(child_statistic_node) + for runtimenode in current_node.runtime_node: + runtime_statistic_node = HostStatisticNode(runtimenode) + current_statistic_node.runtime_node.append( + runtime_statistic_node) + # recursively calculate node statistic values + for thread_id, root_statistic_node in node_statistic_tree.items(): + root_statistic_node.cal_statistic() + + return node_statistic_tree, newresults + + +class TimeRangeSummary: + r""" + Analyse time ranges for each TracerEventType, and summarize the time. + """ + + def __init__(self): + self.CPUTimeRange = collections.defaultdict(list) + self.GPUTimeRange = collections.defaultdict( + lambda: collections.defaultdict(list) + ) # GPU events should be divided into different devices + self.CPUTimeRangeSum = collections.defaultdict(int) + self.GPUTimeRangeSum = collections.defaultdict( + lambda: collections.defaultdict(int)) + self.call_times = collections.defaultdict(int) + + def parse(self, nodetrees): + r""" + Analyse node trees in the profiler result, and get time ranges for each tracer event type.
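The parse() body that follows leans on merge_self_ranges/merge_ranges/sum_ranges from statistic_helper; their combined effect, re-implemented here as a self-contained sketch (not the helper's actual code), is "coalesce overlapping spans, then sum":

    # Coalesce overlapping (start_ns, end_ns) spans and sum their lengths,
    # so time covered by several events is only counted once.
    def merge_and_sum(ranges):
        merged = []
        for s, e in sorted(ranges):
            if merged and s <= merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], e)
            else:
                merged.append([s, e])
        return sum(e - s for s, e in merged)

    assert merge_and_sum([(0, 10), (5, 20), (30, 40)]) == 30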
+ """ + thread2hostnodes = traverse_tree(nodetrees) + for threadid, hostnodes in thread2hostnodes.items(): + CPUTimeRange = collections.defaultdict(list) + GPUTimeRange = collections.defaultdict( + lambda: collections.defaultdict(lambda: collections.defaultdict(list)) + ) # device_id/type/stream_id + for hostnode in hostnodes[1:]: #skip root node + CPUTimeRange[hostnode.type].append( + (hostnode.start_ns, hostnode.end_ns)) + self.call_times[hostnode.type] += 1 + if hostnode.type == TracerEventType.Operator and any( + [name in hostnode.name for name in + _CommunicationOpName]): # special case, communication op + CPUTimeRange[TracerEventType.Communication].append( + (hostnode.start_ns, hostnode.end_ns)) + self.call_times[TracerEventType.Communication] += 1 + is_communication_node = ( + hostnode.type == TracerEventType.Communication + ) or (hostnode.type == TracerEventType.Operator and any( + [name in hostnode.name for name in _CommunicationOpName])) + for runtimenode in hostnode.runtime_node: + CPUTimeRange[runtimenode.type].append( + (runtimenode.start_ns, runtimenode.end_ns)) + self.call_times[runtimenode.type] += 1 + for devicenode in runtimenode.device_node: + GPUTimeRange[devicenode.device_id][devicenode.type][ + devicenode.stream_id].append( + (devicenode.start_ns, devicenode.end_ns)) + self.call_times[devicenode.type] += 1 + if is_communication_node: # gpu activity for communication node + GPUTimeRange[devicenode.device_id][ + TracerEventType.Communication][ + devicenode.stream_id].append(( + devicenode.start_ns, devicenode.end_ns)) + self.call_times[TracerEventType.Communication] += 1 + + for event_type, time_ranges in CPUTimeRange.items(): + time_ranges = merge_self_ranges(time_ranges, is_sorted=False) + self.CPUTimeRange[event_type] = merge_ranges( + self.CPUTimeRange[event_type], time_ranges, is_sorted=True) + for device_id, device_time_ranges in GPUTimeRange.items(): + for event_type, event_time_ranges in device_time_ranges.items(): + for stream_id, time_ranges in event_time_ranges.items(): + time_ranges = merge_self_ranges( + time_ranges, is_sorted=False) + self.GPUTimeRange[device_id][event_type] = merge_ranges( + self.GPUTimeRange[device_id][event_type], + time_ranges, + is_sorted=True) + + for event_type, time_ranges in self.CPUTimeRange.items(): + self.CPUTimeRangeSum[event_type] = sum_ranges(time_ranges) + for device_id, device_time_ranges in self.GPUTimeRange.items(): + for event_type, time_ranges in device_time_ranges.items(): + self.GPUTimeRangeSum[device_id][event_type] = sum_ranges( + time_ranges) + + def get_gpu_devices(self): + return self.GPUTimeRange.keys() + + def get_gpu_range_sum(self, device_id, event_type): + return self.GPUTimeRangeSum[device_id][event_type] + + def get_cpu_range_sum(self, event_type): + return self.CPUTimeRangeSum[event_type] + + +class EventSummary: + r""" + Analyse operator event in profiling data, correlate with its device event. 
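The correlation runs host node -> runtime launch -> device event, so the device time directly attributed to one host-side event is the summed duration of the device events under its runtime nodes. A reduced sketch using the field names of the node classes in the test above (nested children and overlap ignored):

    # Device events hang off a host node's runtime launches; sum them.
    def device_time_ns(hostnode):
        return sum(d.end_ns - d.start_ns
                   for rt in hostnode.runtime_node
                   for d in rt.device_node)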
+ """ + + class DeviceItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.gpu_time = 0 + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_item(self, node): + self.call += 1 + self.add_gpu_time(node.end_ns - node.start_ns) + + class OperatorItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.cpu_time = 0 + self.gpu_time = 0 + self.max_cpu_time = 0 + self.min_cpu_time = float('inf') + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + self.devices = {} + self.operator_inners = {} + + @property + def avg_cpu_time(self): + return self.cpu_time / self.call + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_cpu_time(self, time): + if time > self.max_cpu_time: + self.max_cpu_time = time + if time < self.min_cpu_time: + self.min_cpu_time = time + self.cpu_time += time + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_call(self): + self.call += 1 + + def add_item(self, node): + self.add_call() + self.add_cpu_time(node.cpu_time) + self.add_gpu_time(node.gpu_time) + for child in node.children_node: + if child.name not in self.operator_inners: + self.operator_inners[ + child.name] = EventSummary.OperatorItem(child.name) + self.operator_inners[child.name].add_item(child) + + for runtimenode in node.runtime_node: + for devicenode in runtimenode.device_node: + if devicenode.name not in self.devices: + self.devices[devicenode.name] = EventSummary.DeviceItem( + devicenode.name) + self.devices[devicenode.name].add_item(devicenode) + + class GeneralItem: + def __init__(self, name): + self.name = name + self.call = 0 + self.cpu_time = 0 + self.max_cpu_time = 0 + self.min_cpu_time = float('inf') + self.gpu_time = 0 + self.max_gpu_time = 0 + self.min_gpu_time = float('inf') + + @property + def avg_cpu_time(self): + return self.cpu_time / self.call + + @property + def avg_gpu_time(self): + return self.gpu_time / self.call + + def add_cpu_time(self, time): + if time > self.max_cpu_time: + self.max_cpu_time = time + if time < self.min_cpu_time: + self.min_cpu_time = time + self.cpu_time += time + + def add_gpu_time(self, time): + if time > self.max_gpu_time: + self.max_gpu_time = time + if time < self.min_gpu_time: + self.min_gpu_time = time + self.gpu_time += time + + def add_call(self): + self.call += 1 + + def add_item(self, node): + self.add_call() + self.add_cpu_time(node.cpu_time) + self.add_gpu_time(node.gpu_time) + + def __init__(self): + self.items = {} # for operator summary + self.thread_items = collections.defaultdict( + dict) # for operator summary + self.userdefined_items = {} # for userdefined summary + self.userdefined_thread_items = collections.defaultdict( + dict) # for userdefined summary + self.model_perspective_items = {} # for model summary + self.memory_manipulation_items = {} # for memory manipulation summary + + def parse(self, nodetrees): + r""" + Analyse operator events in the nodetrees.
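parse() below buckets UserDefined/PythonUserDefined events by name before anything else; the substring test it applies, restated as a standalone sketch:

    # Same lowercase substring test parse() uses for user-defined events.
    def is_memory_manipulation(name):
        n = name.lower()
        return 'memcpy' in n or 'memorycopy' in n or 'memset' in n

    assert is_memory_manipulation('AsyncMemcpy')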
+ """ + node_statistic_trees, thread2host_statistic_nodes = wrap_tree(nodetrees) + for threadid, host_statistic_nodes in thread2host_statistic_nodes.items( + ): + for host_statistic_node in host_statistic_nodes[ + 1:]: #skip root node + if host_statistic_node.type == TracerEventType.Operator: + self.add_operator_item(host_statistic_node) + if host_statistic_node.type == TracerEventType.UserDefined\ + or host_statistic_node.type == TracerEventType.PythonUserDefined: + if 'memcpy' in host_statistic_node.name.lower() or 'memorycopy' in host_statistic_node.name.lower()\ + or 'memset' in host_statistic_node.name.lower(): + self.add_memory_manipulation_item(host_statistic_node) + else: + self.add_userdefined_item(host_statistic_node) + + for threadid, root_statistic_node in node_statistic_trees.items(): + deque = collections.deque() + deque.append(root_statistic_node) + while deque: + current_node = deque.popleft() + for child in current_node.children_node: + if child.type == TracerEventType.Forward or child.type == TracerEventType.Dataloader\ + or child.type == TracerEventType.Backward or child.type == TracerEventType.Optimization: + self.add_model_perspective_item( + child) #find first model perspective node + else: + deque.append(child) + + def add_operator_item(self, operator_node): + if operator_node.name not in self.items: + self.items[operator_node.name] = EventSummary.OperatorItem( + operator_node.name) + + self.items[operator_node.name].add_item(operator_node) + + if operator_node.name not in self.thread_items[operator_node.thread_id]: + self.thread_items[operator_node.thread_id][ + operator_node.name] = EventSummary.OperatorItem( + operator_node.name) + self.thread_items[operator_node.thread_id][operator_node.name].add_item( + operator_node) + + def add_userdefined_item(self, userdefined_node): + if userdefined_node.name not in self.userdefined_items: + self.userdefined_items[ + userdefined_node.name] = EventSummary.GeneralItem( + userdefined_node.name) + + self.userdefined_items[userdefined_node.name].add_item(userdefined_node) + + if userdefined_node.name not in self.userdefined_thread_items[ + userdefined_node.thread_id]: + self.userdefined_thread_items[userdefined_node.thread_id][ + userdefined_node.name] = EventSummary.GeneralItem( + userdefined_node.name) + self.userdefined_thread_items[userdefined_node.thread_id][ + userdefined_node.name].add_item(userdefined_node) + + def add_memory_manipulation_item(self, memory_manipulation_node): + if memory_manipulation_node.name not in self.memory_manipulation_items: + self.memory_manipulation_items[ + memory_manipulation_node.name] = EventSummary.GeneralItem( + memory_manipulation_node.name) + self.memory_manipulation_items[memory_manipulation_node.name].add_item( + memory_manipulation_node) + + def add_model_perspective_item(self, model_perspective_node): + if model_perspective_node.type == TracerEventType.Forward: + name = 'Forward' + elif model_perspective_node.type == TracerEventType.Backward: + name = 'Backward' + elif model_perspective_node.type == TracerEventType.Optimization: + name = 'Optimization' + elif model_perspective_node.type == TracerEventType.Dataloader: + name = 'Dataloader' + else: + return + if name not in self.model_perspective_items: + self.model_perspective_items[name] = EventSummary.GeneralItem(name) + self.model_perspective_items[name].add_item(model_perspective_node) + + +class StatisticData: + r""" + Hold all analysed results. 
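StatisticData plus _build_table is essentially the whole surface this patch adds; a minimal driver, mirroring the unit test above (thread_tree, extra_info and the profiler import are built exactly as in that test):

    # Parsing happens once on construction; then render the summary tables.
    stat = profiler.profiler_statistic.StatisticData(thread_tree, extra_info)
    print(profiler.profiler_statistic._build_table(
        stat, sorted_by=profiler.SortedKeys.GPUTotal, time_unit='us'))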
+ """ + + def __init__(self, node_trees, extra_info): + self.node_trees = node_trees + self.extra_info = extra_info + self.time_range_summary = TimeRangeSummary() + self.event_summary = EventSummary() + self.time_range_summary.parse(node_trees) + self.event_summary.parse(node_trees) + + +def _build_table(statistic_data, + sorted_by=SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms', + row_limit=100, + max_src_column_width=75): + """Prints a summary of events.""" + # format table row + SPACING_SIZE = 2 + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + + def add_column(padding, text_dir='<'): + row_format_list[0] += '{: ' + text_dir + str(padding) + '}' + ( + ' ' * SPACING_SIZE) + header_sep_list[0] += '-' * padding + (' ' * SPACING_SIZE) + line_length_list[0] += padding + SPACING_SIZE + + def add_title(padding, text): + left_length = padding - len(text) + half = left_length // 2 + return '-' * half + text + '-' * (left_length - half) + + result = [] + + def append(s): + result.append(s) + result.append('\n') + + def format_time(time, unit='ms', indent=0): + r""" + Transform time in ns to time in unit. + """ + if time == float('inf'): + return '-' + else: + result = float(time) + if unit == 's': + result /= 1e9 + elif unit == 'ms': + result /= 1e6 + elif unit == 'us': + result /= 1e3 + return '{}{:.2f}'.format(' ' * indent, result) + + def format_ratio(ratio, indent=0): + r""" + Transform ratio within [0, 1] to percentage presentation. + """ + return '{}{:.2f}'.format(' ' * indent, ratio * 100) + + total_time = statistic_data.time_range_summary.get_cpu_range_sum( + TracerEventType.ProfileStep) + ###### Print Device Summary ###### + headers = ['Device', 'Utilization (%)'] + name_column_width = 30 + DEFAULT_COLUMN_WIDTH = 20 + add_column(name_column_width) + for _ in headers[1:]: + add_column(DEFAULT_COLUMN_WIDTH) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + + append(add_title(line_length, "Device Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + row_values = [ + 'CPU(Process)', format_ratio( + float(statistic_data.extra_info['Process Cpu Utilization'])) + ] + append(row_format.format(*row_values)) + row_values = [ + 'CPU(System)', format_ratio( + float(statistic_data.extra_info['System Cpu Utilization'])) + ] + append(row_format.format(*row_values)) + for gpu_name in statistic_data.time_range_summary.get_gpu_devices(): + gpu_time = float( + statistic_data.time_range_summary.get_gpu_range_sum( + gpu_name, TracerEventType.Kernel)) + utilization = gpu_time / total_time + row_values = ['GPU{}'.format(gpu_name), format_ratio(utilization)] + append(row_format.format(*row_values)) + + append(header_sep) + append( + "Note:\nCPU(Process) Utilization = Current process CPU time over all cpu cores / elapsed time, so max utilization can be reached 100% * number of cpu cores.\n" + "CPU(System) Utilization = All processes CPU time over all cpu cores(busy time) / (busy time + idle time).\n" + "GPU Utilization = Current process GPU time / elapsed time") + append('-' * line_length) + append('') + append('') + + if total_time == 0: + return ''.join(result) + + ###### Print Overview Summary ###### + headers = ['Event Type', 'CPU Time', 'Ratio (%)'] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + + DEFAULT_COLUMN_WIDTH = 25 + for _ in 
headers: + add_column(DEFAULT_COLUMN_WIDTH) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Overview Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + row_values = [ + 'Total Time', format_time( + total_time, unit=time_unit), format_ratio(1) + ] + append(row_format.format(*row_values)) + cpu_type_time = collections.defaultdict(int) + gpu_type_time = collections.defaultdict(int) + for event_type, value in statistic_data.time_range_summary.CPUTimeRangeSum.items( + ): + cpu_type_time[event_type] = value + + gpu_time_range = collections.defaultdict(list) + for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items( + ): + for event_type, time_range in device_time_ranges.items(): + gpu_time_range[event_type] = merge_ranges( + gpu_time_range[event_type], time_range, is_sorted=True) + for event_type, time_range in gpu_time_range.items(): + gpu_type_time[event_type] = sum_ranges(time_range) + + sorted_items = sorted( + cpu_type_time.items(), key=lambda x: x[1], reverse=True) + for event_type, time in sorted_items: + row_values = [ + ' {}'.format(str(event_type).split('.')[1]), format_time( + time, unit=time_unit), format_ratio(float(time) / total_time) + ] + append(row_format.format(*row_values)) + append(header_sep) + headers = ['', 'GPU Time', 'Ratio (%)'] + append(row_format.format(*headers)) + append(header_sep) + for event_type, time in gpu_type_time.items(): + row_values = [ + ' {}'.format(str(event_type).split('.')[1]), format_time( + time, unit=time_unit), format_ratio(float(time) / total_time) + ] + append(row_format.format(*row_values)) + + append(header_sep) + append( + "Note:\nIn this table, We sum up all collected events in terms of event type.\n" + "The time of events collected on host are presented as CPU Time, and as GPU Time if on device.\n" + "ratio = CPU(GPU) Time / Total Time." + "Events with different types may overlap or inclusion, e.g. 
Operator includes OperatorInner, so the sum of ratios is not 100%.\n" + "The time of events in the same type with overlap will not calculate twice, and all time is summed after merged.\n" + "Example:\n" + "Thread 1:\n" + " Operator: |___________| |__________|\n" + "Thread 2:\n" + " Operator: |____________| |___|\n" + "After merged:\n" + " Result: |______________| |__________|\n") + append('-' * line_length) + append('') + append('') + + ###### Print Operator Summary Report ###### + if statistic_data.event_summary.items: + headers = [ + 'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)', + 'GPU Total / Avg / Max / Min / Ratio(%)' + ] + row_format_list = [""] + header_sep_list = [""] + line_length_list = [-SPACING_SIZE] + name_column_width = 50 + add_column(name_column_width) + add_column(6) + add_column(40) + add_column(40) + + row_format = row_format_list[0] + header_sep = header_sep_list[0] + line_length = line_length_list[0] + + # construct table string + append(add_title(line_length, "Operator Summary")) + append('Time unit: {}'.format(time_unit)) + append(header_sep) + append(row_format.format(*headers)) + append(header_sep) + if thread_sep == True: + thread_items = statistic_data.event_summary.thread_items + else: + thread_items = { + 'All threads merged': statistic_data.event_summary.items + } + for thread_id, items in thread_items.items(): + append(add_title(line_length, "Thread: {}".format(thread_id))) + if sorted_by == SortedKeys.CPUTotal: + sorted_items = sorted( + items.items(), key=lambda x: x[1].cpu_time, reverse=True) + elif sorted_by == SortedKeys.CPUAvg: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].avg_cpu_time, + reverse=True) + elif sorted_by == SortedKeys.CPUMax: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].max_cpu_time, + reverse=True) + elif sorted_by == SortedKeys.CPUMin: + sorted_items = sorted( + items.items(), key=lambda x: x[1].min_cpu_time) + elif sorted_by == SortedKeys.GPUTotal: + sorted_items = sorted( + items.items(), key=lambda x: x[1].gpu_time, reverse=True) + elif sorted_by == SortedKeys.GPUAvg: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].avg_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMax: + sorted_items = sorted( + items.items(), + key=lambda x: x[1].max_gpu_time, + reverse=True) + elif sorted_by == SortedKeys.GPUMin: + sorted_items = sorted( + items.items(), key=lambda x: x[1].min_gpu_time) + + total_cpu_time = 0 + total_gpu_time = 0 + for name, item in sorted_items: + total_cpu_time += item.cpu_time + total_gpu_time += item.gpu_time + for name, item in sorted_items: + row_values = [ + name, item.call, '{} / {} / {} / {} / {}'.format( + format_time( + item.cpu_time, unit=time_unit), + format_time( + item.avg_cpu_time, unit=time_unit), + format_time( + item.max_cpu_time, unit=time_unit), + format_time( + item.min_cpu_time, unit=time_unit), + format_ratio(float(item.cpu_time) / total_cpu_time)), + '{} / {} / {} / {} / {}'.format( + format_time( + item.gpu_time, unit=time_unit), + format_time( + item.avg_gpu_time, unit=time_unit), + format_time( + item.max_gpu_time, unit=time_unit), + format_time( + item.min_gpu_time, unit=time_unit), + format_ratio(float(item.gpu_time) / total_gpu_time)) + ] + append(row_format.format(*row_values)) + if op_detail: + for innerop_name, innerop_node in item.operator_inners.items( + ): + row_values = [ + ' {}'.format(innerop_name), innerop_node.call, + '{} / {} / {} / {} / {}'.format( + format_time( + innerop_node.cpu_time, unit=time_unit), + 
format_time( + innerop_node.avg_cpu_time, unit=time_unit), + format_time( + innerop_node.max_cpu_time, unit=time_unit), + format_time( + innerop_node.min_cpu_time, unit=time_unit), + format_ratio( + float(innerop_node.cpu_time) / + total_cpu_time)), + '{} / {} / {} / {} / {}'.format( + format_time( + innerop_node.gpu_time, unit=time_unit), + format_time( + innerop_node.avg_gpu_time, unit=time_unit), + format_time( + innerop_node.max_gpu_time, unit=time_unit), + format_time( + innerop_node.min_gpu_time, unit=time_unit), + format_ratio( + float(innerop_node.gpu_time) / + total_gpu_time)) + ] + append(row_format.format(*row_values)) + for device_node_name, devicenode in innerop_node.devices.items( + ): + if len(device_node_name) + 4 > name_column_width: + device_node_name = device_node_name[: + name_column_width + - 7] + device_node_name += "..." + row_values = [ + ' {}'.format(device_node_name), + devicenode.call, '- / - / - / - / -', + '{} / {} / {} / {} / {}'.format( + format_time( + devicenode.gpu_time, unit=time_unit), + format_time( + devicenode.avg_gpu_time, + unit=time_unit), + format_time( + devicenode.max_gpu_time, + unit=time_unit), + format_time( + devicenode.min_gpu_time, + unit=time_unit), + format_ratio( + float(devicenode.gpu_time) / + total_gpu_time)) + ] + append(row_format.format(*row_values)) + for device_node_name, device_node in item.devices.items(): + if len(device_node_name) + 2 > name_column_width: + device_node_name = device_node_name[: + name_column_width + - 5] + device_node_name += "..." + row_values = [ + ' {}'.format(device_node_name), devicenode.call, + '- / - / - / - / -', + '{} / {} / {} / {} / {}'.format( + format_time( + devicenode.gpu_time, unit=time_unit), + format_time( + devicenode.avg_gpu_time, unit=time_unit), + format_time( + devicenode.max_gpu_time, unit=time_unit), + format_time( + devicenode.min_gpu_time, unit=time_unit), + format_ratio( + float(devicenode.gpu_time) / + total_gpu_time)) + ] + append(row_format.format(*row_values)) + append(header_sep) + append('') + append('') + return ''.join(result) From 688743bf7ce7846873481dc5fdc2454c6e2de4f6 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 8 Mar 2022 21:22:17 +0800 Subject: [PATCH 36/50] Rename phi::func::TensorReduceImpl to phi::func::ReduceKernel. 
(#40183) --- .../fluid/operators/reduce_ops/reduce_op.cu.h | 4 +-- paddle/phi/kernels/funcs/matrix_reduce.cu | 9 ++---- paddle/phi/kernels/funcs/reduce_function.h | 12 ++++---- .../gpu/broadcast_tensors_grad_kernel.cu | 5 ++-- paddle/phi/kernels/gpu/compare_kernel.cu | 4 +-- paddle/phi/kernels/gpu/elementwise_grad.h | 29 +++++++------------ paddle/phi/kernels/gpu/reduce.h | 24 +++++---------- ...d_cross_entropy_with_logits_grad_kernel.cu | 17 +++-------- ...igmoid_cross_entropy_with_logits_kernel.cu | 18 +++--------- paddle/phi/kernels/gpu/trace_kernel.cu | 5 ++-- .../kernels/impl/matmul_grad_kernel_impl.h | 5 ++-- 11 files changed, 44 insertions(+), 88 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index eb76eee104889..160617695338a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::funcs::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/phi/kernels/funcs/matrix_reduce.cu b/paddle/phi/kernels/funcs/matrix_reduce.cu index 5e288c6e9c217..5c3ebd6bb0167 100644 --- a/paddle/phi/kernels/funcs/matrix_reduce.cu +++ b/paddle/phi/kernels/funcs/matrix_reduce.cu @@ -45,13 +45,8 @@ class MatrixReduceSumFunctor { out_reduce_dims.push_back(idx); } } - TensorReduceImpl>( - dev_ctx, - in, - out, - kps::IdentityFunctor(), - out_reduce_dims, - dev_ctx.stream()); + ReduceKernel>( + dev_ctx, in, out, kps::IdentityFunctor(), out_reduce_dims); } }; diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index ce6bb0d559c81..5834f091d9a4d 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -1087,12 +1087,12 @@ template class ReduceOp, typename TransformOp> -void TensorReduceImpl(const phi::GPUContext& dev_ctx, - const phi::DenseTensor& x, - phi::DenseTensor* y, - const TransformOp& transform, - const std::vector& origin_reduce_dims, - KPStream stream) { +void ReduceKernel(const phi::GPUContext& dev_ctx, + const phi::DenseTensor& x, + phi::DenseTensor* y, + const TransformOp& transform, + const std::vector& origin_reduce_dims) { + auto stream = dev_ctx.stream(); dev_ctx.Alloc(y); auto x_dim = phi::vectorize(x.dims()); diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 926dffc7450dc..d4850b74477d2 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -87,13 +87,12 @@ void BroadcastTensorsGradKernel(const Context& ctx, *input_tensor, ctx.GetPlace(), ctx, output_tensor); } else { // reduce_sum implementation on CUDA - funcs::TensorReduceImpl>( + funcs::ReduceKernel>( ctx, *input_tensor, output_tensor, kps::IdentityFunctor(), - reduce_dims_vec, - ctx.stream()); + reduce_dims_vec); } } } diff --git a/paddle/phi/kernels/gpu/compare_kernel.cu b/paddle/phi/kernels/gpu/compare_kernel.cu index 9c02627e5463b..225164687b75c 100644 --- a/paddle/phi/kernels/gpu/compare_kernel.cu +++ b/paddle/phi/kernels/gpu/compare_kernel.cu @@ -80,8 +80,8 @@ inline void CompareAllKernelImpl(const Context& ctx, for (int i = 0; i < reduce_dims.size(); ++i) { 
reduce_dims[i] = i; } - funcs::TensorReduceImpl>( - ctx, tmp, out, kps::IdentityFunctor(), reduce_dims, ctx.stream()); + funcs::ReduceKernel>( + ctx, tmp, out, kps::IdentityFunctor(), reduce_dims); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index b356f19555fc4..98df65c92f34c 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -29,13 +29,8 @@ void ReduceWrapper(const GPUContext &dev_ctx, DenseTensor *dst) { std::vector reduce_dims = funcs::GetReduceDim(dst->dims(), src->dims(), axis); - funcs::TensorReduceImpl>( - dev_ctx, - *src, - dst, - kps::IdentityFunctor(), - reduce_dims, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims); } template @@ -172,9 +167,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims); } } // dy @@ -187,9 +181,8 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dy, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dy, kps::IdentityFunctor(), reduce_dims); } } } @@ -285,9 +278,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx, } std::vector reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dx, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dx, kps::IdentityFunctor(), reduce_dims); } } // dy @@ -306,9 +298,8 @@ void default_elementwise_sub_grad(const GPUContext &ctx, } else { std::vector reduce_dims = funcs::GetReduceDim(y.dims(), out.dims(), axis); - gpuStream_t stream = ctx.stream(); - funcs::TensorReduceImpl>( - ctx, dout, dy, kps::InverseFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, dout, dy, kps::InverseFunctor(), reduce_dims); } } } diff --git a/paddle/phi/kernels/gpu/reduce.h b/paddle/phi/kernels/gpu/reduce.h index 0319de7558e82..da5315f34479f 100644 --- a/paddle/phi/kernels/gpu/reduce.h +++ b/paddle/phi/kernels/gpu/reduce.h @@ -39,8 +39,6 @@ void Reduce(const KPDevice& dev_ctx, reduce_num *= (x.dims())[i]; } - KPStream stream = dev_ctx.stream(); - if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { auto tmp_tensor = phi::Cast(dev_ctx, x, out_dtype); PD_VISIT_BOOL_AND_FLOATING_AND_COMPLEX_AND_3_TYPES( @@ -48,29 +46,23 @@ void Reduce(const KPDevice& dev_ctx, phi::DataType::INT64, phi::DataType::FLOAT16, out_dtype, - "TensorReduceImpl", + "ReduceKernel", ([&] { using MPType = typename kps::details::MPTypeTrait::Type; - phi::funcs::TensorReduceImpl>( + phi::funcs::ReduceKernel>( dev_ctx, tmp_tensor, out, TransformOp(reduce_num), - reduce_dims, - stream); + reduce_dims); })); } else { using MPType = typename kps::details::MPTypeTrait::Type; - phi::funcs::TensorReduceImpl>( - dev_ctx, - x, - out, - TransformOp(reduce_num), - reduce_dims, - stream); + phi::funcs::ReduceKernel>( + dev_ctx, x, out, TransformOp(reduce_num), reduce_dims); } } } // namespace phi diff --git 
a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu index 598b0138fb3a1..6fc65006ae264 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_grad_kernel.cu @@ -69,17 +69,12 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, dev_ctx.template Alloc(counts_tensor); counts_tensor->Resize(in_grad->dims()); - int limit = in_grad->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; std::vector ins = {&x, &label, &out_grad}; std::vector outs = {in_grad, counts_tensor}; auto functor = SigmoidBwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel( + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, functor); if (normalize) { - T *counts = dev_ctx.template Alloc(counts_tensor); DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); @@ -89,13 +84,8 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - funcs::TensorReduceImpl>( - dev_ctx, - *counts_tensor, - norm_tensor, - NonzeroFunctor(), - reduce_dim, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(norm_tensor); auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); @@ -114,6 +104,7 @@ void SigmoidCrossEntropyWithLogitsGradKernel(const Context &dev_ctx, phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); delete norm_tensor; } + delete counts_tensor; } } // namespace phi diff --git a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu index 13d63f8d97e42..4b6e5628c72af 100644 --- a/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu +++ b/paddle/phi/kernels/gpu/sigmoid_cross_entropy_with_logits_kernel.cu @@ -69,17 +69,12 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, dev_ctx.template Alloc(counts_tensor); counts_tensor->Resize(out->dims()); - int limit = out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; std::vector ins = {&x, &label}; std::vector outs = {out, counts_tensor}; auto functor = SigmoidFwdFunctor(ignore_index); - constexpr int Size = 2; - phi::funcs::ElementwiseKernel( + phi::funcs::ElementwiseKernel( dev_ctx, ins, &outs, functor); if (normalize) { - T *counts = dev_ctx.template Alloc(counts_tensor); DenseTensor *norm_tensor = new DenseTensor(); norm_tensor->Resize({sizeof(T)}); dev_ctx.template Alloc(norm_tensor); @@ -89,13 +84,8 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, reduce_dim.push_back(i); } - funcs::TensorReduceImpl>( - dev_ctx, - *counts_tensor, - norm_tensor, - NonzeroFunctor(), - reduce_dim, - dev_ctx.stream()); + funcs::ReduceKernel>( + dev_ctx, *counts_tensor, norm_tensor, NonzeroFunctor(), reduce_dim); T *norm = dev_ctx.template Alloc(norm_tensor); auto norm_cpu_mem = paddle::memory::Alloc(phi::CPUPlace(), sizeof(T)); T *norm_cpu_ptr = reinterpret_cast(norm_cpu_mem->ptr()); @@ -114,8 +104,8 @@ void SigmoidCrossEntropyWithLogitsKernel(const Context &dev_ctx, phi::funcs::ElementwiseKernel(dev_ctx, div_ins, &div_outs, div_functor); delete norm_tensor; - delete counts_tensor; } + delete 
counts_tensor; } } // namespace phi diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index 4266f0174ff6c..4a749c5b3347d 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -31,11 +31,10 @@ void TraceKernel(const Context& ctx, T* out_data = ctx.template Alloc(out); auto diag = funcs::Diagonal(ctx, &x, offset, axis1, axis2); if (diag.numel() > 0) { - auto stream = ctx.stream(); std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); - funcs::TensorReduceImpl>( - ctx, diag, out, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + ctx, diag, out, kps::IdentityFunctor(), reduce_dims); } else { phi::funcs::SetConstant functor; functor(ctx, out, static_cast(0)); diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index d06bdc5503056..495b93f2a4ef0 100644 --- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -59,9 +59,8 @@ struct ReduceSumForMatmulGrad { const DenseTensor& input, DenseTensor* output, const std::vector& reduce_dims) { - auto stream = dev_ctx.stream(); - funcs::TensorReduceImpl>( - dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims, stream); + funcs::ReduceKernel>( + dev_ctx, input, output, kps::IdentityFunctor(), reduce_dims); } }; #endif From e548f65f96697830035a28f9070b40829408ccdb Mon Sep 17 00:00:00 2001 From: Roc <30228238+sljlp@users.noreply.github.com> Date: Tue, 8 Mar 2022 22:26:02 +0800 Subject: [PATCH 37/50] support ema optimizer in sharding optimizers (#39860) --- .../paddle/distributed/fleet/meta_optimizers/sharding/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index d04a3a53db3e2..b42f21989abd7 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -901,9 +901,10 @@ def save_persistables(exe, dirname, main_program, filename=None): def is_opt_vars(var): # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer # now only Momentum and adam are compatible with sharding + # support EMA optimizer checks = [ "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", - "_velocity_0" + "_velocity_0", "_ema_0" ] for check in checks: if var.name.endswith(check) and var.persistable: From fcae3430808576c6a143562410f2527cc793bc70 Mon Sep 17 00:00:00 2001 From: Yang <3349368+m3ngyang@users.noreply.github.com> Date: Wed, 9 Mar 2022 10:10:55 +0800 Subject: [PATCH 38/50] fix take_along_axis cuda op register bug (#40270) * fix take_along_axis cuda op register bug * add comma after float Co-authored-by: Chen Weihang --- paddle/phi/kernels/gpu/take_along_axis_kernel.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 63113e3e672f3..9665a917d9dc4 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -53,6 +53,7 @@ PD_REGISTER_KERNEL(take_along_axis, GPU, ALL_LAYOUT, phi::TakeAlongAxisKernel, + float, double, int64_t, int, From fb4215b2d1765e305f687d2d1ca5f19c90f7eeb1 Mon Sep 17 00:00:00 2001 From: JingZhuangzhuang <75348594+JZZ-NOTE@users.noreply.github.com> Date: Wed, 9 Mar 
2022 10:21:50 +0800 Subject: [PATCH 39/50] fix batch_norm op kernel (#40171) --- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index 6ad12245d2a45..49b550f51e60e 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -460,10 +460,14 @@ void BatchNormKernel(const Context &ctx, void *reserve_space_ptr = nullptr; void *workspace_ptr = nullptr; DenseTensor workspace_tensor; + DenseTensor reserve_space_tensor; // Create reserve space and workspace for batch norm. // Create tensor for each batchnorm op, it will be used in the // backward. Thus this tensor shouldn't be temp. // auto *reserve_space = ctx.Output("ReserveSpace"); + if (reserve_space == nullptr) { + reserve_space = &reserve_space_tensor; + } PADDLE_ENFORCE_NOT_NULL( reserve_space, phi::errors::NotFound( From 8031a4dc8b05dcfee95af2ca613fc736fc7f9830 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 9 Mar 2022 10:27:30 +0800 Subject: [PATCH 40/50] [Phi] move Reduce max kernel into phi (#40225) * add reduce_max kernel * add reduce max kernel * update reduce max Argumentmapping * remove reduce_max kernel * remove reduce_max kernel * add reduce max infermeta * rename reduce infermeta --- .../operators/reduce_ops/reduce_max_op.cc | 31 ++++++++---- .../operators/reduce_ops/reduce_max_op.cu | 23 --------- .../operators/reduce_ops/reduce_mean_op.cc | 2 +- .../operators/reduce_ops/reduce_sum_op.cc | 2 +- paddle/phi/core/compat/op_utils.h | 1 + paddle/phi/infermeta/unary.cc | 50 ++++++++++++------- paddle/phi/infermeta/unary.h | 22 ++++---- paddle/phi/kernels/cpu/reduce_max_kernel.cc | 39 +++++++++++++++ paddle/phi/kernels/funcs/reduce_functor.h | 8 +++ paddle/phi/kernels/gpu/reduce_max_kernel.cu | 37 ++++++++++++++ paddle/phi/kernels/math_kernel.h | 2 +- paddle/phi/kernels/reduce_max_kernel.cc | 39 +++++++++++++++ paddle/phi/kernels/reduce_max_kernel.h | 38 ++++++++++++++ paddle/phi/ops/compat/reduce_sig.cc | 24 ++++++++- python/paddle/utils/code_gen/api.yaml | 2 +- 15 files changed, 252 insertions(+), 68 deletions(-) delete mode 100644 paddle/fluid/operators/reduce_ops/reduce_max_op.cu create mode 100644 paddle/phi/kernels/cpu/reduce_max_kernel.cc create mode 100644 paddle/phi/kernels/gpu/reduce_max_kernel.cu create mode 100644 paddle/phi/kernels/reduce_max_kernel.cc create mode 100644 paddle/phi/kernels/reduce_max_kernel.h diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a80572..41df8e4a15f09 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + 
reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu deleted file mode 100644 index 8194805ddc373..0000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_max -REGISTER_OP_CUDA_KERNEL( - reduce_max, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 894106883cb0a..4a18330913803 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -97,7 +97,7 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { }; DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, - PD_INFER_META(phi::MeanRawInferMeta)); + PD_INFER_META(phi::ReduceInferMetaBase)); REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 6559ed479c84c..6441d53239e95 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -103,7 +103,7 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { }; DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, - PD_INFER_META(phi::ReduceInferMetaBase)); + PD_INFER_META(phi::SumRawInferMeta)); REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 9947e00ecb53c..1ab718c079438 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -47,6 +47,7 @@ const std::unordered_set deprecated_op_names({"diag", "matmul_grad", "matmul_grad_grad", "mean", + "max", "reshape", "reshape_grad", "expand", diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 17edc84618726..32744659163fd 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -406,7 +406,7 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, ReshapeInferMeta(x, shape, out, config); } -/* Why not use ReduceInferMetaBase directly? +/* Why not use SumRawInferMeta directly? 
Because we need make InferMetaFunction's args follow the design of api.yaml */ void SumInferMeta(const MetaTensor& x, @@ -415,15 +415,13 @@ void SumInferMeta(const MetaTensor& x, bool keep_dim, MetaTensor* out) { bool reduce_all = false; - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, dtype, out); + SumRawInferMeta(x, axis, keep_dim, reduce_all, dtype, out); } -void ReduceInferMetaBase(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - DataType dtype, - MetaTensor* out) { +DDim ReduceInferDim(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all) { auto x_rank = x.dims().size(); std::vector formated_axis = axis; @@ -486,6 +484,17 @@ void ReduceInferMetaBase(const MetaTensor& x, } DDim out_dim = phi::make_ddim(out_dim_vector); + return out_dim; +} + +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out) { + DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); + DataType out_dtype; if (dtype != DataType::UNDEFINED) { out_dtype = dtype; @@ -503,20 +512,23 @@ void ReduceInferMetaBase(const MetaTensor& x, out->set_layout(x.layout()); } -void MeanRawInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - MetaTensor* out) { - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); +void ReduceInferMetaBase(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + MetaTensor* out) { + DDim out_dim = ReduceInferDim(x, axis, keep_dim, reduce_all); + out->set_dims(out_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); } -void MeanInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out) { +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out) { bool reduce_all = false; - ReduceInferMetaBase(x, axis, keep_dim, reduce_all, DataType::UNDEFINED, out); + ReduceInferMetaBase(x, axis, keep_dim, reduce_all, out); } void TransferLayoutInferMeta(const MetaTensor& x, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index dac7c19cf9b08..735a77faefebf 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -94,23 +94,23 @@ void ReshapeWithXShapeInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void SumRawInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + bool reduce_all, + DataType dtype, + MetaTensor* out); + void ReduceInferMetaBase(const MetaTensor& x, const std::vector& axis, bool keep_dim, bool reduce_all, - DataType dtype, MetaTensor* out); -void MeanRawInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - bool reduce_all, - MetaTensor* out); - -void MeanInferMeta(const MetaTensor& x, - const std::vector& axis, - bool keep_dim, - MetaTensor* out); +void ReduceInferMeta(const MetaTensor& x, + const std::vector& axis, + bool keep_dim, + MetaTensor* out); void SumInferMeta(const MetaTensor& x, const std::vector& axis, diff --git a/paddle/phi/kernels/cpu/reduce_max_kernel.cc b/paddle/phi/kernels/cpu/reduce_max_kernel.cc new file mode 100644 index 0000000000000..f9ea0aa0faf06 --- /dev/null +++ b/paddle/phi/kernels/cpu/reduce_max_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/reduce.h" +#include "paddle/phi/kernels/funcs/reduce_functor.h" + +namespace phi { + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max_raw, CPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/funcs/reduce_functor.h b/paddle/phi/kernels/funcs/reduce_functor.h index aebd155ac59cb..4e83d0fa37103 100644 --- a/paddle/phi/kernels/funcs/reduce_functor.h +++ b/paddle/phi/kernels/funcs/reduce_functor.h @@ -41,5 +41,13 @@ struct ProdFunctor { } }; +//////// Max Functor /////// +struct MaxFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->maximum(dim); + } +}; + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/reduce_max_kernel.cu b/paddle/phi/kernels/gpu/reduce_max_kernel.cu new file mode 100644 index 0000000000000..98c3986c51dd6 --- /dev/null +++ b/paddle/phi/kernels/gpu/reduce_max_kernel.cu @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
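An aside on the ReduceInferDim helper factored out above: the output-shape rule it centralizes can be sketched as follows (simplified; negative axes and the reduce_all flag are omitted):

    # Reduced axes collapse to 1 when keep_dim is set, else they are dropped.
    def reduced_shape(shape, axes, keep_dim):
        axes = set(axes)
        return [1 if i in axes else d
                for i, d in enumerate(shape)
                if keep_dim or i not in axes]

    assert reduced_shape([2, 3, 4], [1], keep_dim=True) == [2, 1, 4]
    assert reduced_shape([2, 3, 4], [1], keep_dim=False) == [2, 4]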
+ +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/reduce.h" + +namespace phi { + +template +void MaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max_raw, GPU, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/math_kernel.h b/paddle/phi/kernels/math_kernel.h index fe8f3b749cdd8..7569cbcff087d 100644 --- a/paddle/phi/kernels/math_kernel.h +++ b/paddle/phi/kernels/math_kernel.h @@ -156,7 +156,7 @@ DenseTensor Mean(const Context& dev_ctx, bool keep_dim) { DenseTensor dense_out; MetaTensor meta_out(&dense_out); - ReduceInferMetaBase(x, axis, keep_dim, false, x.dtype(), &meta_out); + SumRawInferMeta(x, axis, keep_dim, false, x.dtype(), &meta_out); MeanKernel(dev_ctx, x, axis, keep_dim, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc new file mode 100644 index 0000000000000..de172a12d7288 --- /dev/null +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/reduce_max_kernel.h" + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MaxKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + DenseTensor* out) { + bool reduce_all = false; + MaxRawKernel(dev_ctx, x, dims, keep_dim, reduce_all, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + max, CPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL( + max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/reduce_max_kernel.h b/paddle/phi/kernels/reduce_max_kernel.h new file mode 100644 index 0000000000000..7560473d43c71 --- /dev/null +++ b/paddle/phi/kernels/reduce_max_kernel.h @@ -0,0 +1,38 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
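The device-independent MaxKernel registered above is deliberately a thin wrapper: it pins reduce_all to false and forwards to MaxRawKernel, so only the raw form carries the extra attribute. From the Python API the split is invisible; a sketch of the expected behavior, assuming a build with these kernels registered:

    import paddle

    x = paddle.rand([2, 3, 4])
    y = paddle.max(x, axis=1)   # explicit axis: the plain "max" path, shape [2, 4]
    z = paddle.max(x)           # no axis: every dim reduced, the "max_raw" path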
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/binary.h"
+#include "paddle/phi/infermeta/unary.h"
+#include "paddle/phi/kernels/empty_kernel.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void MaxRawKernel(const Context& dev_ctx,
+                  const DenseTensor& x,
+                  const std::vector<int64_t>& dims,
+                  bool keep_dim,
+                  bool reduce_all,
+                  DenseTensor* out);
+
+template <typename T, typename Context>
+void MaxKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int64_t>& dims,
+               bool keep_dim,
+               DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc
index 92839fb303075..36798abe4c11b 100644
--- a/paddle/phi/ops/compat/reduce_sig.cc
+++ b/paddle/phi/ops/compat/reduce_sig.cc
@@ -21,7 +21,7 @@ KernelSignature ReduceSumOpArgumentMapping(const ArgumentMappingContext& ctx) {
     bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
     // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
     // InferShape, so we must return the "sum_raw" KernelSignature.
-    // And the InferMeta function(i.e. ReduceInferMetaBase) is accordance with
+    // And the InferMeta function (i.e. SumRawInferMeta) is in accordance with
     // the "sum_raw" KernelSignature
     if (ctx.IsForInferShape() || reduce_all) {
       return KernelSignature("sum_raw",
@@ -40,7 +40,8 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) {
     bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
     // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
     // InferShape, so we must return the "mean_raw" KernelSignature.
-    // And the InferMeta function(i.e. MeanRawInferMeta) is accordance with the
+    // And the InferMeta function (i.e. ReduceInferMetaBase) is in accordance
+    // with the
     // "mean_raw" KernelSignature
     if (ctx.IsForInferShape() || reduce_all) {
       return KernelSignature(
@@ -56,11 +57,30 @@ KernelSignature ReduceProdOpArgumentMapping(const ArgumentMappingContext& ctx) {
       "reduce_prod", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
 }
 
+KernelSignature ReduceMaxOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  if (ctx.IsDenseTensorInput("X")) {
+    bool reduce_all = paddle::any_cast<bool>(ctx.Attr("reduce_all"));
+    // When ctx is InferShapeArgumentMappingContext, the reduce_all is used in
+    // InferShape, so we must return the "max_raw" KernelSignature.
+    // And the InferMeta function (i.e. ReduceInferMetaBase) is in accordance
+    // with the
+    // "max_raw" KernelSignature
+    if (ctx.IsForInferShape() || reduce_all) {
+      return KernelSignature(
+          "max_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+    }
+    return KernelSignature("max", {"X"}, {"dim", "keep_dim"}, {"Out"});
+  }
+  return KernelSignature("unregistered", {}, {}, {});
+}
+
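All four argument mappings above select between the raw and the plain kernel with the same rule. Distilled into Python pseudocode (an illustration of the logic, not a Paddle API):

    def choose_reduce_signature(op, for_infer_shape, reduce_all):
        # InferShape must always see the *_raw form, because reduce_all
        # takes part in output-shape inference.
        if for_infer_shape or reduce_all:
            return (op + "_raw", ["X"], ["dim", "keep_dim", "reduce_all"], ["Out"])
        return (op, ["X"], ["dim", "keep_dim"], ["Out"])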
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum);
 PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean);
+PD_REGISTER_BASE_KERNEL_NAME(reduce_max, max);
 
 PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(reduce_prod, phi::ReduceProdOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(reduce_max, phi::ReduceMaxOpArgumentMapping);
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 8c68ca4d7e0e4..6c27d465cb12e 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -124,7 +124,7 @@
   args : (Tensor x, int64[] axis={}, bool keep_dim=false)
   output : Tensor
   infer_meta :
-    func : MeanInferMeta
+    func : ReduceInferMeta
   kernel :
     func : mean

From 041c4bca832ef342679b17783c67f5d7294b1f6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ren=20Wei=20=28=E4=BB=BB=E5=8D=AB=29?=
Date: Wed, 9 Mar 2022 10:29:37 +0800
Subject: [PATCH 41/50] build documents if public APIs are modified;
 meanwhile, their sample codes should be tested (#39728)

* run document_preview when sample codes are tested
* run document_preview when sample codes are tested
* sphinx-build symbolic link; and build-doc by default
* FLUIDDOCDIR typo
* download the required configurations and some other scripts
* install required python packages.
* clone the specified branch of the docs repo, and if that fails, clone the default branch
* clean the workspace of the docs repo
* use the conf.py imported by https://github.com/PaddlePaddle/docs/pull/4222/
* download and install the boscmd
* Optimize the code comments.
* specify the pypi index server
* only do the doc-build when running in cpu mode
* pull the docs pr; git log the paddle_pr_info
* install jq
* force using the sphinx-build under py3.7
* use our new domain name for the preview
* fix a python package install error
* don't build the doc by default
---
 tools/document_preview.sh | 170 ++++++++++++++++++++++++++++++++++----
 tools/sampcd_processor.py |  45 ++++++++++
 2 files changed, 198 insertions(+), 17 deletions(-)

diff --git a/tools/document_preview.sh b/tools/document_preview.sh
index 83c758d0aa8b8..424169bbc5127 100755
--- a/tools/document_preview.sh
+++ b/tools/document_preview.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
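The rewrite below replaces the old PaddlePaddle.org preview server with a docs-repo build plus a BOS upload. Its control flow, condensed into Python with hypothetical helper names standing in for the shell blocks that follow (none of these functions exist in the repo):

    def preview_docs(build_doc, upload_doc):
        url = None
        if build_doc and sphinx_build_available():
            clone_docs_repo()       # PaddlePaddle/docs, the linked PR branch if any
            run_gendoc()            # ci_scripts/gendoc.sh writes ${OUTPUTDIR}
            if upload_doc:
                bos_sync_outputs()  # bcecmd bos sync of the en/ and zh/ trees
                url = preview_url() # the preview-paddle-pr-<id> site
        print("ipipe_log_param_preview_url: %s" % url)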
-PADDLE_ROOT=/home -mkdir ${PADDLE_ROOT} -cd ${PADDLE_ROOT} -pip install /paddle/build/opt/paddle/share/wheels/*.whl -git clone https://github.com/PaddlePaddle/FluidDoc -git clone https://github.com/tianshuo78520a/PaddlePaddle.org.git -cd ${PADDLE_ROOT}/PaddlePaddle.org -git reset 3feaa68376d8423e41d076814e901e6bf108c705 -cd ${PADDLE_ROOT}/FluidDoc/doc/fluid/api -sh gen_doc.sh -apt-get update && apt-get install -y python-dev build-essential -cd ${PADDLE_ROOT}/PaddlePaddle.org/portal -pip install -r requirements.txt -#If the default port is not occupied, you can use port 8000, you need to replace it with a random port on the CI. -sed -i "s#8000#$1#g" runserver -nohup ./runserver --paddle ${PADDLE_ROOT}/FluidDoc & +is_shell_attribute_set() { # attribute, like "x" + case "$-" in + *"$1"*) return 0 ;; + *) return 1 ;; + esac +} +function get_docs_pr_num_from_paddle_pr_info(){ + # get_repo_pr_info's output + pr_info_file=$1 + if [ ! -r ${pr_info_file} ] ; then + return 1 + fi + + declare -A arr_kv + while read line + do + echo "$line" | grep '^\w\+\s*=\s*.*' > /dev/null + if [ $? = 0 ] ; then + kv=($(echo $line | sed 's/=/\n/g')) + k=($(echo "${kv[0]}" | sed 's/\s//g')) + v=($(echo "${kv[1]}" | sed 's/^\s*//g' | sed 's/\s*$//g')) + # arr_kv[${kv[1]}]=${kv[2]} + arr_kv[${k}]=${v} + fi + done < <(jq -r '.body' ${pr_info_file}) + + echo ${arr_kv[PADDLEDOCS_PR]} + return 0 +} + +# Attention: +# 1. /FluidDoc will be used as the workspace of PaddlePaddle/docs. +# 2. And /docs is used as the output of doc-build process. +# 3. If conflicted with yours, please modify the defination of FLUIDDOCDIR and +# OUTPUTDIR in the subsequent codes. +# 4. The doc-build process is controlled under EnvVar BUILD_DOC and UPLOAD_DOC. +# All the Chinese and English docs will be generated, and then uploaded. + +PREVIEW_URL_PROMPT="ipipe_log_param_preview_url: None" +BUILD_DOC=${BUILD_DOC:=false} +UPLOAD_DOC=${UPLOAD_DOC:=false} + +CURPWD=${PWD} + +if [ -f /usr/local/python3.7.0/bin/sphinx-build ] ; then + if [ -f /usr/local/bin/sphinx-build ] ; then + rm /usr/local/bin/sphinx-build + fi + ln -s /usr/local/python3.7.0/bin/sphinx-build /usr/local/bin/sphinx-build +fi + +if [ "${BUILD_DOC}" = "true" ] && [ -x /usr/local/bin/sphinx-build ] ; then + export FLUIDDOCDIR=${FLUIDDOCDIR:=/FluidDoc} + export OUTPUTDIR=${OUTPUTDIR:=/docs} + export VERSIONSTR=$(echo ${BRANCH} | sed 's@release/@@g') + + if [ -d ${FLUIDDOCDIR} ] ; then + echo "${FLUIDDOCDIR} exists, git clone will be skipped, but git clean will be done." + cd ${FLUIDDOCDIR} + git reset --hard + git clean -dfx + cd ${CURPWD} + else + git clone -b ${BRANCH} --depth=1 https://github.com/PaddlePaddle/docs.git ${FLUIDDOCDIR} + if [ ! "$?" 
= "0" ] ; then + git clone --depth=1 https://github.com/PaddlePaddle/docs.git ${FLUIDDOCDIR} + fi + fi + if [ -d ${OUTPUTDIR} ] ; then + echo "$0: rm -rf ${OUTPUTDIR}" + rm -rf ${OUTPUTDIR} + mkdir -p ${OUTPUTDIR} + fi + + # install requirements + export no_proxy=mirror.baidu.com,${no_proxy} + apt-get install -y --no-install-recommends doxygen jq + echo 'beautifulsoup4 +Markdown +sphinx-sitemap +sphinx-markdown-tables +breathe +exhale +sphinx_design +nbsphinx +' >/tmp/doc-build.requirements && \ + pip install --no-cache-dir -i https://mirror.baidu.com/pypi/simple -r /tmp/doc-build.requirements && \ + rm /tmp/doc-build.requirements + + + source ${FLUIDDOCDIR}/ci_scripts/utils.sh + paddle_pr_info=$(get_repo_pr_info "PaddlePaddle/Paddle" ${GIT_PR_ID}) + docs_pr_id=$(get_docs_pr_num_from_paddle_pr_info ${paddle_pr_info}) + if [ -n "${docs_pr_id}" ] ; then + cd ${FLUIDDOCDIR} + git fetch --depth=1 origin pull/${docs_pr_id}/head + git checkout -b "pr${docs_pr_id}" FETCH_HEAD + git log --pretty=oneline -10 + fi + echo "docs_pr_id=${docs_pr_id}" + + + # build doc + /bin/bash -x ${FLUIDDOCDIR}/ci_scripts/gendoc.sh + if [ $? -ne 0 ];then + echo 'gendoc error' + exit 1 + fi + + if [ "${UPLOAD_DOC}" = "true" ] ; then + curl -o /tmp/linux-bcecmd-0.3.0.zip https://sdk.bce.baidu.com/console-sdk/linux-bcecmd-0.3.0.zip && \ + python -m zipfile -e /tmp/linux-bcecmd-0.3.0.zip /opt && \ + chmod +x /opt/linux-bcecmd-0.3.0/bcecmd && \ + rm /tmp/linux-bcecmd-0.3.0.zip && \ + curl -o /tmp/boscmdconfig.tgz https://paddle-dev-tools-open.bj.bcebos.com/fluiddoc-preview/boscmdconfig.tgz && \ + tar xzf /tmp/boscmdconfig.tgz -C /opt/linux-bcecmd-0.3.0/ && \ + rm /tmp/boscmdconfig.tgz + + # credentials file is empty, please build it if need. + BCECMD=/opt/linux-bcecmd-0.3.0/bcecmd + BCECMD_CONFIG=/opt/linux-bcecmd-0.3.0/boscmdconfig + + is_shell_attribute_set x + xdebug_setted=$? 
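+        # The next lines drop shell xtrace while the BOS AK/SK are appended
+        # to the bcecmd credentials file, then restore it, so the secrets
+        # never appear in the CI log.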
+ if [ $xdebug_setted ] ; then + set +x + fi + if [ -n "${BOS_CREDENTIAL_AK}" ] && [ -n "${BOS_CREDENTIAL_SK}" ] ; then + echo "Ak = ${BOS_CREDENTIAL_AK}" >> ${BCECMD_CONFIG}/credentials + echo "Sk = ${BOS_CREDENTIAL_SK}" >> ${BCECMD_CONFIG}/credentials + fi + if [ $xdebug_setted ] ; then + set -x + fi + + PREVIEW_JOB_NAME="preview-paddle-pr-${GIT_PR_ID}" + BOSBUCKET=${BOSBUCKET:=paddle-site-web-dev} + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/en/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/en/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/en/${VERSIONSTR}/_sources/" + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/en/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/en/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/en/${VERSIONSTR}/_sources/" + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/zh/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/zh/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/zh/${VERSIONSTR}/_sources/" + ${BCECMD} --conf-path ${BCECMD_CONFIG} bos sync "${OUTPUTDIR}/zh/${VERSIONSTR}" "bos:/${BOSBUCKET}/documentation/zh/${PREVIEW_JOB_NAME}" \ + --delete --yes --exclude "${OUTPUTDIR}/zh/${VERSIONSTR}/_sources/" + PREVIEW_URL_PROMPT="ipipe_log_param_preview_url: http://${PREVIEW_JOB_NAME}.${PREVIEW_SITE:-paddle.run}/documentation/docs/zh/api/index_cn.html" + fi +fi + +cd ${CURPWD} +# print the preview url +echo "${PREVIEW_URL_PROMPT}" diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index d8cb70c9dd107..2d8692c5bc7e5 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -550,6 +550,42 @@ def get_incrementapi(): f.write('\n') +def exec_gen_doc(): + result = True + cmd = ["bash", "document_preview.sh"] + logger.info("----exec gen_doc----") + start_time = time.time() + subprc = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output, error = subprc.communicate() + msg = "".join(output.decode(encoding='utf-8')) + err = "".join(error.decode(encoding='utf-8')) + end_time = time.time() + + if subprc.returncode != 0: + logger.info("----gen_doc msg----") + logger.info(msg) + logger.error("----gen_doc error msg----") + logger.error(err) + logger.error("----exec gen_doc failed----") + result = False + else: + logger.info("----gen_doc msg----") + logger.info(msg) + logger.info("----exec gen_doc success----") + + for fn in [ + '/docs/en/develop/index_en.html', '/docs/zh/develop/index_cn.html' + ]: + if os.path.exists(fn): + logger.info('%s exists.', fn) + else: + logger.error('%s not exists.', fn) + + # msg is the returned code execution report + return result, msg, end_time - start_time + + arguments = [ # flags, dest, type, default, help ['--gpu_id', 'gpu_id', int, 0, 'GPU device id to use [0]'], @@ -570,6 +606,11 @@ def parse_args(): parser.add_argument('--debug', dest='debug', action="store_true") parser.add_argument('--full-test', dest='full_test', action="store_true") parser.add_argument('mode', type=str, help='run on device', default='cpu') + parser.add_argument( + '--build-doc', + dest='build_doc', + action='store_true', + help='build doc if need.') for item in arguments: parser.add_argument( item[0], dest=item[1], help=item[4], type=item[2], default=item[3]) @@ -702,3 +743,7 @@ def parse_args(): exit(1) logger.info("Sample code check is successful!") + + if args.mode == "cpu": + # As cpu mode is also run with the GPU whl, so skip it in gpu mode. 
+ exec_gen_doc() From b5a8a0d96b594ae16ae95b645aa38e3bbc78ec76 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 9 Mar 2022 11:22:21 +0800 Subject: [PATCH 42/50] [MLU] add mlu buffer reader (#40131) --- .../fluid/operators/reader/buffered_reader.cc | 68 +++++++++++++++++++ .../fluid/operators/reader/buffered_reader.h | 12 ++++ .../fluid/platform/stream_callback_manager.cc | 8 +-- 3 files changed, 84 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602a3..4b6759ea165ed 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. 
" + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df88..f0f3b6b7f9fdf 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 7fce0296d437a..7148afee273fd 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -80,10 +80,10 @@ void StreamCallbackManager::AddCallback( #endif #if PADDLE_WITH_MLU - VLOG(3) << "MLULaunchCallback at stream: " << stream_; - LOG(ERROR) << "failed to call MLULaunchCallback, " - << "because mlu not support StreamAddCallback yet. " - << "function: " << func; + VLOG(3) << "MLULaunchCallback at stream: " << stream_ + << " Failed to call MLULaunchCallback, " + << "because mlu not support StreamAddCallback yet. 
" + << "function: " << func; #endif } From 86effa0ce1309ea27f29af6a28dd5bb3d4aa1ac5 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 9 Mar 2022 11:23:02 +0800 Subject: [PATCH 43/50] [IPU] update ipu unittests p3 (#40072) * update ipu UTs part3 * rename uts * sync api changes * update uts for new api * update use_ipumodel() * split pr --- .../unittests/ipu/test_matmul_v2_op_ipu.py | 186 ++++++++++++++++++ .../tests/unittests/ipu/test_mean_op_ipu.py | 109 ++++------ ...pipeline.py => test_model_pipeline_ipu.py} | 16 +- .../tests/unittests/ipu/test_mul_op_ipu.py | 112 +++++------ .../unittests/ipu/test_pool_avg_op_ipu.py | 84 ++++---- .../unittests/ipu/test_pool_max_op_ipu.py | 128 ++++++------ .../tests/unittests/ipu/test_pow_op_ipu.py | 140 +++++++------ .../tests/unittests/ipu/test_print_op_ipu.py | 143 ++++++++++++++ .../unittests/ipu/test_reduce_x_op_ipu.py | 124 ++++++------ .../ipu/test_reshape_inplace_op_ipu.py | 84 ++++---- .../unittests/ipu/test_reshape_op_ipu.py | 83 ++++---- ...est_save_load.py => test_save_load_ipu.py} | 105 +++++++--- .../tests/unittests/ipu/test_scale_op_ipu.py | 152 +++++++------- 13 files changed, 934 insertions(+), 532 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_model_pipeline.py => test_model_pipeline_ipu.py} (86%) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py rename python/paddle/fluid/tests/unittests/ipu/{test_save_load.py => test_save_load_ipu.py} (58%) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py new file mode 100644 index 0000000000000..9f1c115403adf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_matmul_v2_op_ipu.py @@ -0,0 +1,186 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[2, 3]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {"transpose_x": False, "transpose_y": False} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + + out = paddle.matmul(x, y, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = { + "transpose_x": True, + "transpose_y": True, + } + + +class TestCase3(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[5, 4, 2, 3]) + y = np.random.uniform(size=[5, 4, 3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase4(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[4, 3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase5(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[4, 2, 3]) + y = np.random.uniform(size=[3, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class 
TestCase6(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3]) + y = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +@unittest.skip("not supported") +class TestCase6_2(TestCase6): + def set_data_feed(self): + x = np.random.uniform(size=[3]) + y = np.random.uniform(size=[3]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + def set_op_attrs(self): + self.attrs = {"transpose_x": True, "transpose_y": True} + + +class TestCase7(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3, 1]) + y = np.random.uniform(size=[1, 2]) + + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +@unittest.skip("dim > 4 is not supported") +class TestCase8(TestBase): + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[6, 5, 4, 2, 3]).astype('float32'), + "y": np.random.uniform(size=[6, 5, 4, 3, 2]).astype('float32'), + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py index f04d712755dea..b9dd7358b7955 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mean_op_ipu.py @@ -16,13 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,97 +26,79 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) + @property + def fp16_enabled(self): + return True - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} - self.feed_list = list(self.feed.keys()) + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - self.attrs['axis'] = None - self.attrs['keepdim'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') - out = paddle.mean(x, **self.attrs) - fetch_list = [out.name] + out 
= paddle.fluid.layers.mean(x) - if run_ipu: - place = paddle.IPUPlace() - else: + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() -class TestCase1(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = 1 - self.attrs['keepdim'] = False - - -class TestCase2(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = 2 - self.attrs['keepdim'] = False - - -class TestCase3(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = 2 - self.attrs['keepdim'] = True - - -class TestCase4(TestBase): - def set_attrs(self): - self.attrs = {} - self.attrs['axis'] = None - self.attrs['keepdim'] = True + self.check(output_dict) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py similarity index 86% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py rename to python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py index e1ed7603ed627..7e70239964002 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_model_pipeline.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_pipeline_ipu.py @@ -17,8 +17,7 @@ import numpy as np import unittest import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler +import paddle.static paddle.enable_static() SEED = 2021 @@ -28,7 +27,7 @@ "core is not compiled with IPU") class TestCastNet(unittest.TestCase): def _test(self, run_ipu=True): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = SEED @@ -37,14 +36,14 @@ def _test(self, run_ipu=True): np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): image = paddle.static.data( name='image', shape=[1, 3, 10, 10], dtype='float32') - with fluid.ipu_shard(ipu_index=0): + with paddle.static.ipu_shard_guard(index=0): conv1 = paddle.static.nn.conv2d( image, num_filters=3, filter_size=3, bias_attr=False) - with fluid.ipu_shard(ipu_index=1): + with paddle.static.ipu_shard_guard(index=1): conv2 = 
paddle.static.nn.conv2d( conv1, num_filters=3, filter_size=3, bias_attr=False) loss = paddle.mean(conv2) @@ -60,9 +59,10 @@ def _test(self, run_ipu=True): feed_list = [image.name] fetch_list = [loss.name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( + ipu_strategy.set_graph_config( num_ipus=2, is_training=False, enable_manual_shard=True) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_pipelining_config(enable_pipelining=False) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py index 78a2589d9aca5..7a9135626df79 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mul_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,90 +26,98 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[2, 5]).astype('float32'), - "y": np.random.uniform(size=[5, 3]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.uniform(size=[2, 5]) + y = np.random.uniform(size=[5, 3]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "x_num_col_dims": 1, "y_num_col_dims": 1, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.mul(x, y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: 
+ if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 2, 5]).astype('float32'), - "y": np.random.uniform(size=[5, 3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[1, 2, 5]) + y = np.random.uniform(size=[5, 3]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "x_num_col_dims": 2, "y_num_col_dims": 1, @@ -123,13 +125,13 @@ def set_attrs(self): class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 4, 2, 9]).astype('float32'), - "y": np.random.uniform(size=[3, 6, 1, 2, 3]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[3, 4, 2, 9]) + y = np.random.uniform(size=[3, 6, 1, 2, 3]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'x_num_col_dims': 2, 'y_num_col_dims': 2, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py index e81591ad68368..4288b82832ede 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_avg_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,23 +26,25 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': 
data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "pool_size": 3, "pool_type": 'avg', @@ -60,53 +56,59 @@ def set_attrs(self): "data_format": 'NCHW', } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py index a7c45c6686f10..911a163b8aa9c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pool_max_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,23 +26,25 @@ 
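From this patch on, every migrated IPU test converges on the same three-mode harness, visible in each diff above (ExecutionMode, fp16_enabled, and check come from the shared op_test_ipu utilities, not new API):

    output_dict = {}
    for mode in ExecutionMode:   # CPU_FP32, IPU_FP32, IPU_POPART_FP16
        # fp16 modes run only when a test opts in via fp16_enabled
        if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled:
            break
        output_dict[mode] = self._test_base(mode).flatten()
    self.check(output_dict)      # cross-checks every mode within atol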
class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "pool_size": 3, "pool_type": 'max', @@ -60,120 +56,126 @@ def set_attrs(self): "data_format": 'NCHW', } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.pool2d(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_size'] = 3 class TestCase1_2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_size'] = [3, 1] class TestCase2(TestBase): - def 
set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_stride'] = 2 class TestCase2_2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_stride'] = [2, 1] class TestCase3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = [1, 1] class TestCase3_2(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = [1, 1, 2, 2] @unittest.skip('auto_pad is not currently supported') class TestCase3_3(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = 'VALID' @unittest.skip('auto_pad is not currently supported') class TestCase3_4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['pool_padding'] = 'SAME' class TestCase4(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['global_pooling'] = True class TestCase5(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['ceil_mode'] = True class TestCase6(TestBase): - def set_attrs(self): - super().set_attrs() + def set_op_attrs(self): + super().set_op_attrs() self.attrs['exclusive'] = False diff --git a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py index 5059de7ba77b1..b3562d722c4e6 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_pow_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,124 +26,146 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 2, 2]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"factor": 2.0} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = 
self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.pow(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 2, 2]).astype('float32'), - "y": np.array([2.0]).astype('float32'), + def set_data_feed(self): + data1 = np.random.uniform(size=[1, 3, 2, 2]) + data2 = np.array([2.0]) + + self.feed_fp32 = { + "x": data1.astype(np.float32), + "y": data2.astype(np.float32) + } + self.feed_fp16 = { + "x": data1.astype(np.float16), + "y": data2.astype(np.float16) } - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') factor = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.pow(x, factor=factor, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py new file mode 100644 index 0000000000000..c9454e5945f7d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_data_feed(self): + self.feed = { + "x": np.random.uniform(size=[1, 3, 3, 3]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [x.dtype for x in self.feed.values()] + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype=self.feed_dtype[0]) + out = paddle.fluid.layers.conv2d( + x, num_filters=3, filter_size=3) + out = paddle.fluid.layers.Print(out, **self.attrs) + + if self.is_training: + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + fetch_list = [loss.name] + else: + fetch_list = [out.name] + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + if self.is_training: + result = [] + for _ in range(self.epoch): + 
loss_res = exe.run(program, + feed=self.feed, + fetch_list=fetch_list) + result.append(loss_res[0]) + return np.array(result) + else: + result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + return result[0] + + def test(self): + res0 = self._test_base(False) + res1 = self._test_base(True) + + self.assertTrue( + np.allclose( + res0.flatten(), res1.flatten(), atol=self.atol)) + + self.assertTrue(res0.shape == res1.shape) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {"message": "input_data"} + + +class TestTrainCase1(TestBase): + def set_op_attrs(self): + # "forward" : print forward + # "backward" : print forward and backward + # "both": print forward and backward + self.attrs = {"message": "input_data2", "print_phase": "both"} + + def set_training(self): + self.is_training = True + self.epoch = 2 + + +@unittest.skip("attrs are not supported") +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = { + "first_n": 10, + "summarize": 10, + "print_tensor_name": True, + "print_tensor_type": True, + "print_tensor_shape": True, + "print_tensor_layout": True, + "print_tensor_lod": True + } + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py index ac8ad08e8b28c..929ee51b65094 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reduce_x_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,125 +26,137 @@ class TestMean(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.init_op() + self.set_test_op() + + @property + def fp16_enabled(self): + return True - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_mean def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = self.op(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != 
ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def run_test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - def set_feed0(self): - self.feed = {} - self.feed["in_0"] = np.random.uniform(size=[2, 4]).astype(np.float32) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + def set_data_feed0(self): + data = np.random.uniform(size=[2, 4]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} self.set_feed_attr() - def set_feed1(self): - self.feed = {} - self.feed["in_0"] = np.random.uniform(size=[2, 2, 2]).astype(np.float32) + def set_data_feed1(self): + data = np.random.uniform(size=[2, 2, 2]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} self.set_feed_attr() - def set_attr0(self): + def set_op_attr0(self): self.attrs = {} self.attrs['dim'] = None self.attrs['keep_dim'] = False def test_case0(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.run_test_base() def test_case1(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.attrs['dim'] = 0 self.run_test_base() def test_case2(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.attrs['dim'] = -1 self.run_test_base() def test_case3(self): - self.set_feed0() - self.set_attr0() + self.set_data_feed0() + self.set_op_attr0() self.attrs['dim'] = 1 self.run_test_base() def test_case4(self): - self.set_feed0() + self.set_data_feed0() self.attrs = {} self.attrs['dim'] = 1 self.attrs['keep_dim'] = True self.run_test_base() def test_case5(self): - self.set_feed1() + self.set_data_feed1() self.attrs = {} self.attrs['dim'] = [1, 2] self.attrs['keep_dim'] = False self.run_test_base() def test_case6(self): - self.set_feed1() + self.set_data_feed1() self.attrs = {} self.attrs['dim'] = [0, 1] self.attrs['keep_dim'] = False self.run_test_base() def test_case7(self): - self.set_feed1() + self.set_data_feed1() self.attrs = {} self.attrs['dim'] = [0, 1] self.attrs['keep_dim'] = True @@ -158,22 +164,22 @@ def test_case7(self): class TestMax(TestMean): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_max class TestMin(TestMean): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_min class TestProd(TestMean): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_prod class TestSum(TestMean): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.reduce_sum diff --git 
a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py index f312b7b69ad79..9ddf5c7537fdc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_inplace_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,76 +26,84 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "shape": [30, 10], "inplace": True, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + add = paddle.fluid.layers.elementwise_add(x, x) out = paddle.fluid.layers.reshape(add, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] 
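Note: the tests in this series all collect per-mode outputs and delegate the comparison to IPUOpTest.check, and they iterate over ExecutionMode; neither is defined in this patch (both live in op_test_ipu.py). A minimal sketch of what the usage below implies; the enum ordering and the body of check are inferred from the call sites (mode > ExecutionMode.IPU_FP32 selecting fp16 feeds, atol/atol_fp16 coming from set_atol), not copied from the real helper:

    import numpy as np
    from enum import IntEnum


    class ExecutionMode(IntEnum):
        # The tests compare modes with ">", so order matters: everything
        # above IPU_FP32 is expected to run with fp16 feeds.
        CPU_FP32 = 1
        IPU_FP32 = 2
        IPU_POPART_FP16 = 3


    def check(self, output_dict, check_shape=False):
        # The CPU fp32 output is the reference; every other mode is
        # compared against it with a tolerance matching its precision.
        baseline = output_dict[ExecutionMode.CPU_FP32]
        for mode, result in output_dict.items():
            atol = self.atol_fp16 if mode > ExecutionMode.IPU_FP32 else self.atol
            assert np.allclose(result, baseline, atol=atol)
            if check_shape:
                assert result.shape == baseline.shape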
def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): diff --git a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py index 5163838bc0cd6..119771931701c 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_reshape_op_ipu.py @@ -16,13 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -31,82 +26,92 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() - self.set_attrs() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([2, 4, 6]) + @property + def fp16_enabled(self): + return True - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) + def set_data_feed(self): + data = np.random.uniform(size=[2, 4, 6]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} - self.feed_list = list(self.feed.keys()) + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['shape'] = [6, 8] self.attrs['inplace'] = False - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32') + out = paddle.fluid.layers.reshape(x=x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = 
exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict, check_shape=True) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['shape'] = [2, 3, -1, 2] self.attrs['inplace'] = False class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['shape'] = [-1, 0, 3, 2] self.attrs['inplace'] = False diff --git a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py similarity index 58% rename from python/paddle/fluid/tests/unittests/ipu/test_save_load.py rename to python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py index 24bb8e111842c..3a69487306208 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_save_load.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_save_load_ipu.py @@ -12,55 +12,52 @@ # See the License for the specific language governing permissions and # limitations under the License. +import tempfile import unittest -import shutil import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -paddle.enable_static() - @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): self.set_atol() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} - self.feed_list = list(self.feed.keys()) + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'sgd' + self.attrs['enable_fp16'] = False + self.attrs['model_path'] = tempfile.TemporaryDirectory() def _test_base(self, save_otherwise_load): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = self.SEED startup_prog.random_seed = self.SEED - generator = fluid.unique_name.UniqueNameGenerator() + generator = paddle.fluid.unique_name.UniqueNameGenerator() - with fluid.unique_name.guard(generator): - with fluid.scope_guard(scope): + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( 
name=self.feed_list[0], @@ -91,12 +88,17 @@ def _test_base(self, save_otherwise_load): exe.run(startup_prog) if not save_otherwise_load: - paddle.static.load(main_prog, "model/model") + paddle.static.load(main_prog, self.attrs['model_path'].name) ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( + ipu_strategy.set_graph_config( is_training=self.attrs['is_training']) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_precision_config( + enable_fp16=self.attrs['enable_fp16']) + ipu_strategy.set_options({ + 'save_per_n_step': self.attrs['save_at_step'] + }) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile( self.feed_list, fetch_list) @@ -104,16 +106,17 @@ def _test_base(self, save_otherwise_load): run_steps = self.attrs['steps'] if save_otherwise_load \ else self.attrs['steps'] - self.attrs['save_at_step'] + feed = self.feed_fp16 if self.attrs[ + 'enable_fp16'] else self.feed_fp32 for i in range(run_steps): - tmp = exe.run(program, - feed=self.feed, - fetch_list=fetch_list) + tmp = exe.run(program, feed=feed, fetch_list=fetch_list) # currently, we update opt state every sess.run, # will optimize if save_otherwise_load and \ i == self.attrs['save_at_step'] - 1: - paddle.static.save(main_prog, "model/model") + paddle.static.save(main_prog, + self.attrs['model_path'].name) if save_otherwise_load and i >= self.attrs['save_at_step']: result.append(tmp) @@ -129,25 +132,65 @@ def test_base(self): self.assertTrue( np.allclose( res0.flatten(), res1.flatten(), atol=self.atol)) - shutil.rmtree("model", True) + self.attrs['model_path'].cleanup() class TestAdam(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'adam' + self.attrs['enable_fp16'] = False + self.attrs['model_path'] = tempfile.TemporaryDirectory() class TestLamb(TestBase): - def set_attrs(self): + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'lamb' + self.attrs['enable_fp16'] = False + self.attrs['model_path'] = tempfile.TemporaryDirectory() + + +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") +class TestSGDFP16(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'sgd' + self.attrs['enable_fp16'] = True + self.attrs['model_path'] = tempfile.TemporaryDirectory() + + +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") +class TestAdamFP16(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'adam' + self.attrs['enable_fp16'] = True + self.attrs['model_path'] = tempfile.TemporaryDirectory() + + +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") +class TestLambFP16(TestBase): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'lamb' + self.attrs['enable_fp16'] = True + self.attrs['model_path'] = tempfile.TemporaryDirectory() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py index 6ad2a89a738b7..49714eba8d4d1 
100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_scale_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,80 +26,88 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return False + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": data.astype(np.float32)} + self.feed_fp16 = {"x": data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": 1.0, "bias": 0.0, "bias_after_scale": True, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.scale(x, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > 
ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": 5.0, "bias": 0.0, @@ -114,7 +116,7 @@ def set_attrs(self): class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "scale": 1.0, "bias": 0.5, @@ -123,7 +125,16 @@ def set_attrs(self): class TestCase3(TestBase): - def set_attrs(self): + def set_op_attrs(self): + self.attrs = { + "scale": 5.0, + "bias": 0.7, + "bias_after_scale": True, + } + + +class TestCase4(TestBase): + def set_op_attrs(self): self.attrs = { "scale": 1.0, "bias": 0.0, @@ -131,59 +142,66 @@ def set_attrs(self): } -class TestCase4(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[3, 3, 10, 10]).astype('float32'), - "y": np.array([3.0]).astype('float32'), - } +class TestCase5(TestBase): + def set_data_feed(self): + x = np.random.uniform(size=[3, 3, 10, 10]) + y = np.array([3.0]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "bias": 0.0, "bias_after_scale": True, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.scale(x, scale=y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] From fe765cb34e5a3970119f73472ab8cdd250924f11 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 9 Mar 2022 11:23:12 +0800 Subject: [PATCH 44/50] [IPU] update ipu unittests p1 (#39923) * update ipu UTs part1 * rename ut * sync api changes * update uts for new api * update use_ipumodel() * update use_ipumodel() * split pr --- 
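Note (placed after the "---" separator so that git am ignores it): every file in this patch applies the same mechanical migration. A condensed sketch of the new compile path, using the APIs exactly as they appear in the hunks below; program construction is elided and the helper name is illustrative:

    import paddle
    import paddle.static


    def compile_for_ipu(main_prog, feed_list, fetch_list, enable_fp16=False):
        # Replaces paddle.fluid.compiler.IPUCompiledProgram and the
        # camel-case IpuStrategy.SetGraphConfig of the old API.
        ipu_strategy = paddle.static.IpuStrategy()
        ipu_strategy.set_graph_config(is_training=False)
        if enable_fp16:
            # New in this series: opt-in popart fp16 execution.
            ipu_strategy.set_precision_config(enable_fp16=True)
        return paddle.static.IpuCompiledProgram(
            main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list)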
 .../unittests/ipu/test_dropout_op_ipu.py           |  88 +++++-----
 .../unittests/ipu/test_elemetwise_x_op_ipu.py      | 150 +++++++++-------
 .../tests/unittests/ipu/test_equal_op_ipu.py       | 114 +++++++------
 .../tests/unittests/ipu/test_expand_op_ipu.py      | 135 ++++++++-------
 .../ipu/test_fill_any_like_op_ipu.py               | 111 ++++++++++++
 .../ipu/test_fill_constant_op_ipu.py               |  68 ++++----
 .../ipu/test_fp16_inference_io_ipu.py              | 160 ++++++++++++++++++
 .../tests/unittests/ipu/test_gather_op_ipu.py      |  97 +++++------
 .../tests/unittests/ipu/test_gelu_op_ipu.py        |  93 +++++-----
 .../unittests/ipu/test_greater_op_ipu.py           | 140 +++++++++++++++
 .../unittests/ipu/test_groupnorm_op_ipu.py         | 112 ++++++------
 ...l_io.py => test_inference_model_io_ipu.py}      |  71 ++++----
 .../unittests/ipu/test_instancenorm_op_ipu.py      | 104 ++++++------
 13 files changed, 960 insertions(+), 483 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py
 create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py
 rename python/paddle/fluid/tests/unittests/ipu/{test_ipu_inference_model_io.py => test_inference_model_io_ipu.py} (78%)

diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
index 8b1560edfd81d..e34da7f70167a 100644
--- a/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
+++ b/python/paddle/fluid/tests/unittests/ipu/test_dropout_op_ipu.py
@@ -16,14 +16,9 @@
 import numpy as np
 import paddle
-import paddle.fluid as fluid
-import paddle.fluid.compiler as compiler
-import paddle.optimizer
 import paddle.static
-from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest,
-                                                          np_dtype_to_fluid_str)
-
-paddle.enable_static()
+from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode,
+                                                          IPUOpTest)
@@ -32,81 +27,88 @@ class TestBase(IPUOpTest):
     def setUp(self):
         self.set_atol()
         self.set_training()
-        self.set_feed()
+        self.set_data_feed()
         self.set_feed_attr()
-        self.set_attrs()
+        self.set_op_attrs()
 
-    def set_feed(self):
-        self.feed = {
-            "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32')
-        }
+    @property
+    def fp16_enabled(self):
+        return True
+
+    def set_data_feed(self):
+        data = np.random.uniform(size=[1, 3, 10, 10])
+        self.feed_fp32 = {'x': data.astype(np.float32)}
+        self.feed_fp16 = {'x': data.astype(np.float16)}
 
     def set_feed_attr(self):
-        self.feed_shape = [x.shape for x in self.feed.values()]
-        self.feed_list = list(self.feed.keys())
-        self.feed_dtype = [
-            np_dtype_to_fluid_str(x.dtype) for x in self.feed.values()
-        ]
+        self.feed_shape = [x.shape for x in self.feed_fp32.values()]
+        self.feed_list = list(self.feed_fp32.keys())
 
-    def set_attrs(self):
+    def set_op_attrs(self):
         self.attrs = {
             "dropout_prob": 0.5,
             "is_test": True,
             "dropout_implementation": "downgrade_in_infer"
         }
 
-    def _test_base(self, run_ipu=True):
-        scope = fluid.core.Scope()
+    def _test_base(self, exec_mode):
+        scope = paddle.static.Scope()
         main_prog = paddle.static.Program()
         startup_prog = paddle.static.Program()
-        SEED = self.SEED
-        main_prog.random_seed = SEED
-        startup_prog.random_seed = SEED
+        main_prog.random_seed = self.SEED
+        startup_prog.random_seed = self.SEED
 
-        with fluid.scope_guard(scope):
+        with paddle.static.scope_guard(scope):
             with paddle.static.program_guard(main_prog, startup_prog):
                 x = paddle.static.data(
                     name=self.feed_list[0],
shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + dropout = paddle.fluid.layers.dropout(x, **self.attrs) out = paddle.fluid.layers.elementwise_add(dropout, dropout) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "dropout_prob": 0.5, "is_test": True, @@ -115,7 +117,7 @@ def set_attrs(self): class TestCase2(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "dropout_prob": 0.0, "is_test": False, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py index 07b06d77c90ff..a9d6d2308326e 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_elemetwise_x_op_ipu.py @@ -16,14 +16,9 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import (ExecutionMode, + IPUOpTest) @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,101 +27,136 @@ class TestMul(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.init_op() + self.set_test_op() + + @property + def fp16_enabled(self): + if IPUOpTest.use_ipumodel(): + return False + else: + return True - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_mul def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] - - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() 
startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = self.op(x, y, **self.attrs) - fetch_list = [out.name] + fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def run_test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) def test_case0(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(2, 3, 4, 5)) + + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.attrs = {} self.set_feed_attr() self.run_test_base() def test_case1(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(3, 4)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(3, 4)) + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.set_feed_attr() self.attrs = {"axis": 1} self.run_test_base() def test_case2(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(5)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(5)) + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.set_feed_attr() self.attrs = {"axis": -1} self.run_test_base() def 
test_case3(self): - self.feed = { - "x": np.random.uniform(size=(2, 3, 4, 5)).astype('float32'), - "y": np.random.uniform(size=(2)).astype('float32'), + data_x = np.random.uniform(size=(2, 3, 4, 5)) + data_y = np.random.uniform(size=(2)) + self.feed_fp32 = { + "x": data_x.astype('float32'), + "y": data_y.astype('float32'), + } + self.feed_fp16 = { + "x": data_x.astype('float16'), + "y": data_y.astype('float16'), } self.set_feed_attr() self.attrs = {"axis": 0} @@ -134,37 +164,43 @@ def test_case3(self): class TestAdd(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_add class TestSub(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_sub class TestDiv(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_div class TestMin(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_min class TestMax(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_max class TestPow(TestMul): - def init_op(self): + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_pow class TestMod(TestMul): - def init_op(self): + def set_atol(self): + self.atol = 1e-7 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_test_op(self): self.op = paddle.fluid.layers.elementwise_mod diff --git a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py index c319894bfae25..5b18c73851324 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_equal_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,94 +26,106 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() - - def set_feed(self): - self.feed = { - "x": np.ones([1, 10]).astype('float32'), - "y": np.zeros([1, 10]).astype('float32'), + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.zeros([1, 10]) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), } def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - 
with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): - # XX x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='float32') + out = paddle.fluid.layers.equal(x, y, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(True) - res1 = self._test_base(False) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.ones([1, 10]).astype('float32'), - "y": np.ones([1, 10]).astype('float32'), - } + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.ones([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} class TestCase2(TestBase): - def set_feed(self): - self.feed = { - "x": np.ones([1, 10]).astype('float32'), - "y": np.arange(0, 10).reshape([1, 10]).astype('float32'), - } + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.arange(0, 10).reshape([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py index 5b7ea61568ecd..966dfdef87b54 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_expand_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,125 +26,142 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() 
self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[2, 3, 1]).astype('float32')} + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"expand_times": [1, 2, 2]} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype="float32") + out = paddle.fluid.layers.expand(x, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = {"x": np.random.uniform(size=[2, 2]).astype('float32')} + def set_data_feed(self): + x = np.random.uniform(size=[2, 2]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + 
self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + self.feed_dtype = [x.dtype for x in self.feed_fp32.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) - expand_times = fluid.layers.fill_constant( + dtype="float32") + + expand_times = paddle.fluid.layers.fill_constant( shape=[len(self.feed_shape[0])], dtype="int32", value=2) out = paddle.fluid.layers.expand( x, expand_times=expand_times, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py new file mode 100644 index 0000000000000..00b855a5a7a42 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_any_like_op_ipu.py @@ -0,0 +1,111 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 3, 1]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {'fill_value': 0.3, 'dtype': 'float32'} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + x_fill = paddle.full_like(x, **self.attrs) + out = paddle.fluid.layers.elementwise_add(x_fill, x_fill) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {'fill_value': 3, 'dtype': 'int32'} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py index c62e0c08f9c79..3a1c202bf1133 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_fill_constant_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,21 +26,23 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() 
+ + @property + def fp16_enabled(self): + return True - def set_feed(self): + def set_data_feed(self): self.feed = {} def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_dtype = [x.dtype for x in self.feed.values()] - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'name': 'x', 'shape': [1, 3, 3, 3], @@ -54,33 +50,34 @@ def set_attrs(self): 'value': 0.3, } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.fluid.layers.fill_constant(**self.attrs) out = paddle.fluid.layers.elementwise_add(x, x) - fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -89,19 +86,18 @@ def _test_base(self, run_ipu=True): result = exe.run(program, feed=self.feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) - - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { 'name': 'x', 'shape': [1, 3, 3, 3], diff --git a/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py new file mode 100644 index 0000000000000..cd29ff705b88f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_fp16_inference_io_ipu.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import shutil + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['steps'] = 100 + self.attrs['save_at_step'] = 20 + self.attrs['is_training'] = True + self.attrs['opt_type'] = 'sgd' + self.attrs['path'] = 'model' + self.attrs['model_name'] = 'test' + + def _test_save(self): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + generator = paddle.fluid.unique_name.UniqueNameGenerator() + self.full_name = '/'.join( + [self.attrs['path'], self.attrs['model_name']]) + + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + scale = paddle.fluid.layers.scale( + x, scale=1.0, bias=0.0, bias_after_scale=True) + conv = paddle.static.nn.conv2d( + scale, + num_filters=3, + filter_size=3, + bias_attr=False, + name='conv2d') + loss = paddle.mean(conv) + + if self.attrs['is_training']: + if self.attrs['opt_type'] == 'sgd': + sgd = paddle.optimizer.SGD(learning_rate=1e-2) + sgd.minimize(loss) + elif self.attrs['opt_type'] == 'adam': + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) + elif self.attrs['opt_type'] == 'lamb': + lamb = paddle.optimizer.Lamb(learning_rate=1e-2) + lamb.minimize(loss) + + fetch_list = [loss.name] + + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=True) + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile( + self.feed_list, fetch_list) + + for _ in range(self.attrs['steps']): + exe.run(program, feed=self.feed_fp16, fetch_list=fetch_list) + + paddle.static.save_inference_model( + self.full_name, x, loss, exe, program=program.org_program) + + def _test_load(self, run_ipu): + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + [inference_program, feed_target_names, fetch_targets] = ( + paddle.static.load_inference_model(self.full_name, exe)) + + if run_ipu: + feed_list = feed_target_names + fetch_list = [fetch_targets[0].name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=False) + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + inference_program, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = 
inference_program + + feed = self.feed_fp16 if run_ipu else self.feed_fp32 + result = [] + for i in range(10): + feed["in_0"] += np.array([1.1 * i]).astype(feed["in_0"].dtype) + out = exe.run(program, feed=feed, fetch_list=[fetch_targets]) + result.append(out) + + return np.array(result) + + def test_base(self): + self._test_save() + cpu_res = self._test_load(False) + ipu_res = self._test_load(True).astype(np.float32) + + self.assertTrue( + np.allclose( + cpu_res, ipu_res, rtol=self.rtol_fp16, atol=self.atol_fp16)) + + shutil.rmtree(self.attrs['path'], True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py index d5be8ae0cf775..01a56fd14be04 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gather_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,85 +26,92 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[10, 20]).astype('float32'), - "y": np.array([1, 3, 5]).astype('int32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[10, 20]) + y = np.array([1, 3, 5]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.int32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.int32)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') y = paddle.static.data( name=self.feed_list[1], shape=self.feed_shape[1], - dtype=self.feed_dtype[1]) + dtype='int32') + out = paddle.fluid.layers.gather(x, index=y, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - 
ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[100]).astype('float32'), - "y": np.array([1, 3, 5]).astype('int32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[100]) + y = np.array([1, 3, 5]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.int32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.int32)} if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py index ca8c0935d782c..602289f3f1904 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_gelu_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,80 +26,89 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_atol(self): - self.atol = 1e-3 + @property + def fp16_enabled(self): + return True - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32') - } + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"approximate": False} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + 
main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') + out = paddle.fluid.layers.gelu(x, **self.attrs) fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) - return result[0] + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) -@unittest.skip('approximate=True is not supported') class TestCase1(TestBase): - def set_attrs(self): + def set_atol(self): + self.atol = 1e-10 + self.rtol = 1e-6 + self.atol_fp16 = 2e-3 + self.rtol_fp16 = 1e-3 + + def set_op_attrs(self): self.attrs = {"approximate": True} diff --git a/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py new file mode 100644 index 0000000000000..05a37dcb3d514 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_greater_op_ipu.py @@ -0,0 +1,140 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
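+
+# NOTE: verifies paddle.fluid.layers.greater_than against the CPU reference
+# across all ExecutionMode variants; the boolean outputs are cast to int32
+# before self.check() so the numeric comparison is well defined.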
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + x = np.random.randn(3, 4, 5) + y = np.random.randn(3, 4, 5) + self.feed_fp32 = { + "x": x.astype(np.float32), + "y": y.astype(np.float32), + } + self.feed_fp16 = { + "x": x.astype(np.float16), + "y": y.astype(np.float16), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + y = paddle.static.data( + name=self.feed_list[1], + shape=self.feed_shape[1], + dtype='float32') + + out = paddle.fluid.layers.greater_than(x, y, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode).flatten().astype(np.int32) + + self.check(output_dict) + + +class TestCase1(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.ones([10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase2(TestBase): + def set_data_feed(self): + x = np.ones([1, 10]) + y = np.zeros([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +class TestCase3(TestBase): + def set_data_feed(self): + x = np.zeros([1, 10]) + y = np.ones([1, 10]) + self.feed_fp32 = {"x": x.astype(np.float32), "y": y.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16), "y": y.astype(np.float16)} + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py index eb644c2c6670f..102e764cb2f17 100644 --- 
a/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_groupnorm_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,43 +26,49 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 8, 10, 10]).astype('float32'), - } + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 3e-6 + self.rtol = 1e-6 + self.atol_fp16 = 4e-3 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 8, 10, 10]) + self.feed_fp32 = {'in_0': data.astype(np.float32)} + self.feed_fp16 = {'in_0': data.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "groups": 8, "epsilon": 1e-05, "data_layout": 'NCHW', } - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') if self.is_training: ch = self.feed_shape[0][1] @@ -78,62 +78,68 @@ def _test_base(self, run_ipu=True): bias = paddle.ParamAttr(trainable=True) out = paddle.fluid.layers.nn.group_norm( conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) else: - scale = True - bias = True out = paddle.fluid.layers.nn.group_norm( - x, param_attr=scale, bias_attr=bias, **self.attrs) + x, param_attr=True, bias_attr=True, **self.attrs) if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) fetch_list = [loss.name] else: fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + 
ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + if self.is_training: result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=feed, fetch_list=fetch_list) result.append(loss_res[0]) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + if mode > ExecutionMode.IPU_FP32 and self.is_training: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestCase1(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "groups": 4, "epsilon": 1e-05, @@ -147,11 +153,15 @@ def set_training(self): self.epoch = 10 +@unittest.skipIf(IPUOpTest.use_ipumodel(), "skip for ipumodel") class TestTrainCase2(TestBase): def set_atol(self): - self.atol = 1e-3 + self.atol = 7e-4 + self.rtol = 1e-6 + self.atol_fp16 = 4e-3 + self.rtol_fp16 = 1e-3 - def set_attrs(self): + def set_op_attrs(self): self.attrs = { "groups": 4, "epsilon": 1e-05, @@ -163,7 +173,5 @@ def set_training(self): self.epoch = 10 -# not support `group_norm(x, param_attr=False, bias_attr=False, **self.attrs)` - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py similarity index 78% rename from python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py rename to python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py index 0a331d804545d..33a63a80e3bc0 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_inference_model_io_ipu.py @@ -12,59 +12,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
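+# NOTE: the saved model now lives in a tempfile.TemporaryDirectory() instead
+# of a fixed 'model' directory, and cleanup() below replaces the old manual
+# shutil.rmtree call once the CPU/IPU comparison has finished.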
+import tempfile import unittest -import shutil import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest -paddle.enable_static() - @unittest.skipIf(not paddle.is_compiled_with_ipu(), "core is not compiled with IPU") class TestBase(IPUOpTest): def setUp(self): self.set_atol() - self.set_feed() - self.set_attrs() - - def set_feed(self): - self.feed_shape = [] - self.feed_shape.append([1, 3, 10, 10]) - - self.feed = {} - self.feed["in_0"] = np.random.uniform( - size=self.feed_shape[0]).astype(np.float32) - + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 + + def set_data_feed(self): + data = np.random.uniform(size=[1, 3, 10, 10]) + self.feed = {"in_0": data.astype(np.float32)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] self.feed_list = list(self.feed.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['save_at_step'] = 20 self.attrs['is_training'] = True self.attrs['opt_type'] = 'sgd' - self.attrs['path'] = 'model' + self.attrs['path'] = tempfile.TemporaryDirectory() self.attrs['model_name'] = 'test' def _test_save(self): - scope = fluid.core.Scope() + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() main_prog.random_seed = self.SEED startup_prog.random_seed = self.SEED - generator = fluid.unique_name.UniqueNameGenerator() + generator = paddle.fluid.unique_name.UniqueNameGenerator() self.full_name = '/'.join( - [self.attrs['path'], self.attrs['model_name']]) + [self.attrs['path'].name, self.attrs['model_name']]) - with fluid.unique_name.guard(generator): - with fluid.scope_guard(scope): + with paddle.fluid.unique_name.guard(generator): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], @@ -88,16 +88,16 @@ def _test_save(self): elif self.attrs['opt_type'] == 'lamb': lamb = paddle.optimizer.Lamb(learning_rate=1e-2) lamb.minimize(loss) - fetch_list = [loss.name] + fetch_list = [loss.name] place = paddle.IPUPlace() exe = paddle.static.Executor(place) exe.run(startup_prog) ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig( + ipu_strategy.set_graph_config( is_training=self.attrs['is_training']) - program = compiler.IPUCompiledProgram( + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile( self.feed_list, fetch_list) @@ -125,8 +125,8 @@ def _test_load(self, run_ipu): feed_list = feed_target_names fetch_list = [fetch_targets[0].name] ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=False) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=False) + program = paddle.static.IpuCompiledProgram( inference_program, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: @@ -134,7 +134,7 @@ def _test_load(self, run_ipu): tmp = exe.run(program, feed=self.feed, fetch_list=[fetch_targets]) - return tmp + return np.array(tmp) def test_base(self): self._test_save() @@ -142,27 +142,26 @@ def test_base(self): ipu_res = self._test_load(True) self.assertTrue(np.allclose(cpu_res, ipu_res, atol=self.atol)) - - shutil.rmtree(self.attrs['path'], 
True) + self.attrs['path'].cleanup() class TestAdam(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['is_training'] = True self.attrs['opt_type'] = 'adam' - self.attrs['path'] = 'model' + self.attrs['path'] = tempfile.TemporaryDirectory() self.attrs['model_name'] = 'test' class TestLamb(TestBase): - def set_attrs(self): + def set_op_attrs(self): self.attrs = {} self.attrs['steps'] = 100 self.attrs['is_training'] = True self.attrs['opt_type'] = 'lamb' - self.attrs['path'] = 'model' + self.attrs['path'] = tempfile.TemporaryDirectory() self.attrs['model_name'] = 'test' diff --git a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py index ee9cd875cf298..ed8f3950ace82 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_instancenorm_op_ipu.py @@ -16,14 +16,8 @@ import numpy as np import paddle -import paddle.fluid as fluid -import paddle.fluid.compiler as compiler -import paddle.optimizer import paddle.static -from paddle.fluid.tests.unittests.ipu.op_test_ipu import (IPUOpTest, - np_dtype_to_fluid_str) - -paddle.enable_static() +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode @unittest.skipIf(not paddle.is_compiled_with_ipu(), @@ -32,39 +26,45 @@ class TestBase(IPUOpTest): def setUp(self): self.set_atol() self.set_training() - self.set_feed() + self.set_data_feed() self.set_feed_attr() - self.set_attrs() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_atol(self): + self.atol = 1e-6 + self.rtol = 1e-5 + self.atol_fp16 = 1e-2 + self.rtol_fp16 = 1e-3 - def set_feed(self): - self.feed = { - "x": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), - } + def set_data_feed(self): + x = np.random.uniform(size=[1, 3, 10, 10]) + self.feed_fp32 = {"x": x.astype(np.float32)} + self.feed_fp16 = {"x": x.astype(np.float16)} def set_feed_attr(self): - self.feed_shape = [x.shape for x in self.feed.values()] - self.feed_list = list(self.feed.keys()) - self.feed_dtype = [ - np_dtype_to_fluid_str(x.dtype) for x in self.feed.values() - ] + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) - def set_attrs(self): + def set_op_attrs(self): self.attrs = {"epsilon": 1e-05} - def _test_base(self, run_ipu=True): - scope = fluid.core.Scope() + def _test_base(self, exec_mode): + scope = paddle.static.Scope() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() - SEED = self.SEED - main_prog.random_seed = SEED - startup_prog.random_seed = SEED + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED - with fluid.scope_guard(scope): + with paddle.static.scope_guard(scope): with paddle.static.program_guard(main_prog, startup_prog): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], - dtype=self.feed_dtype[0]) + dtype='float32') if self.is_training: ch = self.feed_shape[0][1] @@ -74,58 +74,64 @@ def _test_base(self, run_ipu=True): bias = paddle.ParamAttr(trainable=True) out = paddle.fluid.layers.nn.instance_norm( conv1, param_attr=scale, bias_attr=bias, **self.attrs) + loss = paddle.mean(out) + adam = paddle.optimizer.Adam(learning_rate=1e-2) + adam.minimize(loss) else: - scale = True - bias = True out = paddle.fluid.layers.nn.instance_norm( - x, param_attr=scale, bias_attr=bias, **self.attrs) + x, 
param_attr=True, bias_attr=True, **self.attrs) if self.is_training: - loss = paddle.mean(out) - adam = paddle.optimizer.Adam(learning_rate=1e-2) - adam.minimize(loss) fetch_list = [loss.name] else: fetch_list = [out.name] - if run_ipu: - place = paddle.IPUPlace() - else: + if exec_mode == ExecutionMode.CPU_FP32: place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + exe = paddle.static.Executor(place) exe.run(startup_prog) - if run_ipu: + if exec_mode != ExecutionMode.CPU_FP32: feed_list = self.feed_list ipu_strategy = paddle.static.IpuStrategy() - ipu_strategy.SetGraphConfig(is_training=self.is_training) - program = compiler.IPUCompiledProgram( + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) else: program = main_prog + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + if self.is_training: result = [] for _ in range(self.epoch): loss_res = exe.run(program, - feed=self.feed, + feed=feed, fetch_list=fetch_list) result.append(loss_res) return np.array(result) else: - result = exe.run(program, feed=self.feed, fetch_list=fetch_list) + result = exe.run(program, feed=feed, fetch_list=fetch_list) return result[0] - def test_base(self): - res0 = self._test_base(False) - res1 = self._test_base(True) + def test(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + if mode > ExecutionMode.IPU_FP32 and self.is_training: + break + output_dict[mode] = self._test_base(mode).flatten() - self.assertTrue( - np.allclose( - res0.flatten(), res1.flatten(), atol=self.atol)) - - self.assertTrue(res0.shape == res1.shape) + self.check(output_dict) class TestTrainCase1(TestBase): @@ -134,7 +140,5 @@ def set_training(self): self.epoch = 10 -# not support `instance_norm(x, param_attr=False, bias_attr=False, **self.attrs)` - if __name__ == "__main__": unittest.main() From 0b597754e27113129e9969e6be8d2a588def032e Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 9 Mar 2022 11:39:41 +0800 Subject: [PATCH 45/50] add ipu uts (#40205) --- .../unittests/ipu/test_flatten_op_ipu.py | 118 +++++++++++++ .../tests/unittests/ipu/test_optimizer_ipu.py | 165 ++++++++++++++++++ 2 files changed, 283 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py create mode 100644 python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py diff --git a/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py new file mode 100644 index 0000000000000..6f0cafc66805e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_flatten_op_ipu.py @@ -0,0 +1,118 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
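+
+# NOTE: exercises paddle.fluid.layers.flatten with axis 0, 1 and 2; results
+# are compared with check_shape=True since flatten only reshapes its input
+# without touching the data.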
+ +import unittest + +import numpy as np +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest, ExecutionMode + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_training() + self.set_data_feed() + self.set_feed_attr() + self.set_op_attrs() + + @property + def fp16_enabled(self): + return True + + def set_data_feed(self): + data = np.random.uniform(size=[2, 2, 4, 6]) + self.feed_fp32 = {"in_0": data.astype(np.float32)} + self.feed_fp16 = {"in_0": data.astype(np.float16)} + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed_fp32.values()] + self.feed_list = list(self.feed_fp32.keys()) + + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = 1 + + def _test_base(self, exec_mode): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + x = paddle.static.data( + name=self.feed_list[0], + shape=self.feed_shape[0], + dtype='float32') + + out = paddle.fluid.layers.flatten(x=x, **self.attrs) + + fetch_list = [out.name] + + if exec_mode == ExecutionMode.CPU_FP32: + place = paddle.CPUPlace() + else: + place = paddle.IPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if exec_mode != ExecutionMode.CPU_FP32: + feed_list = self.feed_list + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=self.is_training) + if exec_mode == ExecutionMode.IPU_POPART_FP16: + ipu_strategy.set_precision_config(enable_fp16=True) + program = paddle.static.IpuCompiledProgram( + main_prog, + ipu_strategy=ipu_strategy).compile(feed_list, fetch_list) + else: + program = main_prog + + feed = self.feed_fp32 + if exec_mode > ExecutionMode.IPU_FP32: + feed = self.feed_fp16 + + result = exe.run(program, feed=feed, fetch_list=fetch_list) + return result[0] + + def test_base(self): + output_dict = {} + for mode in ExecutionMode: + if mode > ExecutionMode.IPU_FP32 and not self.fp16_enabled: + break + output_dict[mode] = self._test_base(mode) + + self.check(output_dict, check_shape=True) + + +class TestCase1(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = 0 + + +class TestCase2(TestBase): + def set_op_attrs(self): + self.attrs = {} + self.attrs['axis'] = 2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py new file mode 100644 index 0000000000000..1cc10da3d7344 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ipu/test_optimizer_ipu.py @@ -0,0 +1,165 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import paddle +import paddle.static +from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest + + +@unittest.skipIf(not paddle.is_compiled_with_ipu(), + "core is not compiled with IPU") +class TestBase(IPUOpTest): + def setUp(self): + self.set_atol() + self.set_data_feed() + self.set_feed_attr() + self.set_attrs() + + def set_atol(self): + self.atol = 1e-6 + + def set_data_feed(self): + self.feed = { + "image": np.random.uniform(size=[1, 3, 10, 10]).astype('float32'), + } + + def set_feed_attr(self): + self.feed_shape = [x.shape for x in self.feed.values()] + self.feed_list = list(self.feed.keys()) + self.feed_dtype = [x.dtype for x in self.feed.values()] + + def set_attrs(self): + self.attrs = { + "optimizer": 'sgd', + "weight_decay": 0.0, + "loss_scaling": 1.0, + } + + def _test_optimizer(self, run_ipu=True): + scope = paddle.static.Scope() + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = self.SEED + startup_prog.random_seed = self.SEED + np.random.seed(self.SEED) + + with paddle.static.scope_guard(scope): + with paddle.static.program_guard(main_prog, startup_prog): + image = paddle.static.data( + name='image', shape=[1, 3, 10, 10], dtype='float32') + conv1 = paddle.static.nn.conv2d( + image, num_filters=3, filter_size=3, bias_attr=False) + loss = paddle.mean(conv1) + + weight_decay = self.attrs['weight_decay'] + opt = paddle.optimizer.SGD(learning_rate=1e-1, + weight_decay=weight_decay) + if self.attrs['optimizer'] == 'adam': + opt = paddle.optimizer.Adam( + learning_rate=1e-1, weight_decay=weight_decay) + elif self.attrs['optimizer'] == 'lamb': + + opt = paddle.optimizer.Lamb( + learning_rate=1e-1, lamb_weight_decay=weight_decay) + opt.minimize(loss) + + if run_ipu: + place = paddle.IPUPlace() + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + if run_ipu: + feed_list = [image.name] + fetch_list = [loss.name] + ipu_strategy = paddle.static.IpuStrategy() + ipu_strategy.set_graph_config(is_training=True) + ipu_strategy.loss_scaling = self.attrs["loss_scaling"] + program = paddle.static.IpuCompiledProgram( + main_prog, ipu_strategy=ipu_strategy).compile(feed_list, + fetch_list) + else: + program = main_prog + + result = [] + for epoch in range(100): + loss_res = exe.run(program, feed=self.feed, fetch_list=[loss]) + result.append(loss_res) + + return np.array(result) + + def test(self): + # cpu and ipu dimenstion mismatch, cpu:(100, 1, 1), ipu:(100, 1) + ipu_loss = self._test_optimizer(True).flatten() + cpu_loss = self._test_optimizer(False).flatten() + + self.assertTrue(np.allclose(ipu_loss, cpu_loss, atol=self.atol)) + + +@unittest.skip('do not support L2 regularization') +class TestSGD(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'sgd', + "weight_decay": 0.1, + "loss_scaling": 2.0, + } + + +@unittest.skip('do not support L2 regularization') +class TestAdamCase1(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.1, + "loss_scaling": 3.0, + } + + +class TestAdamCase2(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'adam', + "weight_decay": 0.0, + "loss_scaling": 4.0, + } + + +@unittest.skip('seems cpu output wrong') +class TestLambCase1(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.0, + "loss_scaling": 5.0, + } + + +@unittest.skip('seems cpu output wrong') +class 
TestLamb(TestBase): + def set_attrs(self): + self.attrs = { + "optimizer": 'lamb', + "weight_decay": 0.1, + "loss_scaling": 6.0, + } + + +if __name__ == "__main__": + unittest.main() From 2037fa68db8a79ff4869afcf0ce6864d7e05449f Mon Sep 17 00:00:00 2001 From: xiongkun Date: Wed, 9 Mar 2022 11:49:44 +0800 Subject: [PATCH 46/50] [optest]: fix transpose, support different parameter name between python_api and KernelSignature. (#40258) * optest: fix transpose * fix --- .../paddle/fluid/tests/unittests/op_test.py | 75 ++++++++++++++----- 1 file changed, 57 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 6455da924757b..457f20ac5b06b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -50,6 +50,7 @@ no_check_set_white_list, op_threshold_white_list, no_grad_set_white_list, ) +from paddle.fluid.dygraph.dygraph_to_static.utils import parse_arg_and_kwargs def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs): @@ -698,19 +699,55 @@ def _check_api_outs_by_dygraph_outs(self, api_outs, dygraph_outs, place): self.__class__.__name__) def _calc_python_api_output(self, place): - def prepare_python_api_arguments(op_proto_ins, op_proto_attrs, + def prepare_python_api_arguments(api, op_proto_ins, op_proto_attrs, kernel_sig): """ map from `op proto inputs and attrs` to `api input list and api attrs dict` """ + + class Empty: + pass + + def is_empty(a): + return isinstance(a, Empty) + + def get_default(idx, all_params_number, defaults): + related_idx = idx - all_params_number + len(defaults) + assert related_idx >= 0, "%d-th arguments don't have default value" % idx + return defaults[related_idx] + + def remove_name(x): + if isinstance(x, list): return [i for i in x if i != 'name'] + if isinstance(x, dict): + return {k: v for k, v in x.items() if k != 'name'} + assert False, "Only support list or dict." + + def to_defaults_list(params, defaults): + return [defaults[p] for p in params if p in defaults] + # NOTE(xiongkun): why don't use input arguments dicts ? # Because we don't know the python api name of each arguments. + # using parse_arg_and_kwargs, we can get the all api information we need. + api_params, api_defaults = [ + remove_name(item) for item in parse_arg_and_kwargs(api) + ] + api_defaults = to_defaults_list(api_params, api_defaults) inputs_sig, attrs_sig, outputs_sig = kernel_sig - input_arguments = [op_proto_ins[name] for name in inputs_sig] - attr_arguments = { - name: op_proto_attrs[name] - for name in attrs_sig if name in op_proto_attrs - } - return input_arguments, attr_arguments + inputs_and_attrs = inputs_sig + attrs_sig + assert ( + len(api_params) == len(inputs_and_attrs) + ), "inputs and attrs length must equals to python api length. 
(May be output is in argument list?)" + input_arguments = [op_proto_ins[name] for name in inputs_sig] + [ + op_proto_attrs[name] if name in op_proto_attrs else Empty() + for name in attrs_sig + ] + results = [] + for idx, arg in enumerate(input_arguments): + if is_empty(arg): + results.append( + get_default(idx, len(input_arguments), api_defaults)) + else: + results.append(arg) + return results def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): if not isinstance(ret_tuple, (tuple, list)): @@ -720,25 +757,27 @@ def construct_output_dict_by_kernel_sig(ret_tuple, output_sig): len(output_sig), len(ret_tuple)) return {a: b for a, b in zip(output_sig, ret_tuple)} - def assumption_assert_and_transform(args, argvs): + def assumption_assert_and_transform(args, inp_num): """ - transform by the following rules: + transform inputs by the following rules: 1. [Tensor] -> Tensor 2. [Tensor, Tensor, ...] -> list of Tensors only support "X" is list of Tensor, currently don't support other structure like dict. """ - for inp in args: + for inp in args[:inp_num]: assert isinstance( inp, list ), "currently only support `X` is [Tensor], don't support other structure." - args = [inp[0] if len(inp) == 1 else inp for inp in args] - return args, argvs + args = [ + inp[0] if len(inp) == 1 else inp for inp in args[:inp_num] + ] + args[inp_num:] + return args - def cal_python_api(python_api, args, argvs, kernel_sig): - args, argvs = assumption_assert_and_transform(args, argvs) + def cal_python_api(python_api, args, kernel_sig): inputs_sig, attrs_sig, outputs_sig = kernel_sig - ret_tuple = python_api(*args, **argvs) + args = assumption_assert_and_transform(args, len(inputs_sig)) + ret_tuple = python_api(*args) return construct_output_dict_by_kernel_sig(ret_tuple, outputs_sig) with fluid.dygraph.base.guard(place=place): @@ -764,11 +803,11 @@ def cal_python_api(python_api, args, argvs, kernel_sig): assert hasattr( self, "python_api" ), "Please set the `self.python_api` if you want to compare python api output." - arg, argv = prepare_python_api_arguments(inputs, attrs_outputs, - kernel_sig) + args = prepare_python_api_arguments(self.python_api, inputs, + attrs_outputs, kernel_sig) """ we directly return the cal_python_api value because the value is already tensor. 
""" - return cal_python_api(self.python_api, arg, argv, kernel_sig) + return cal_python_api(self.python_api, args, kernel_sig) def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): self.__class__.op_type = self.op_type # for ci check, please not delete it for now From 9968c56321a74c51fb762cb583f80bac6de90e6f Mon Sep 17 00:00:00 2001 From: chenenquan Date: Wed, 9 Mar 2022 11:53:36 +0800 Subject: [PATCH 47/50] [Phi] Migrate linspace op to phi (#40124) * [Phi] Migrate linspace op * [Phi] Migrate linspace op * [Phi] Fix linspace op * [PHI] rename data_tranform to data_type_transform * [PHI] Fix DECLARE and PD --- paddle/fluid/operators/linspace_op.cc | 45 ++------ paddle/fluid/operators/linspace_op.cu | 104 ------------------ paddle/fluid/operators/linspace_op.h | 76 ------------- paddle/phi/infermeta/ternary.cc | 29 +++++ paddle/phi/infermeta/ternary.h | 5 + paddle/phi/kernels/cpu/linspace_kernel.cc | 71 ++++++++++++ .../phi/kernels/funcs/data_type_transform.h | 58 ++++++++++ paddle/phi/kernels/gpu/linspace_kernel.cu | 97 ++++++++++++++++ paddle/phi/kernels/linspace_kernel.h | 26 +++++ 9 files changed, 298 insertions(+), 213 deletions(-) delete mode 100644 paddle/fluid/operators/linspace_op.cu delete mode 100644 paddle/fluid/operators/linspace_op.h create mode 100644 paddle/phi/kernels/cpu/linspace_kernel.cc create mode 100644 paddle/phi/kernels/funcs/data_type_transform.h create mode 100644 paddle/phi/kernels/gpu/linspace_kernel.cu create mode 100644 paddle/phi/kernels/linspace_kernel.h diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e893a..378c7573d6129 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9df..0000000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc09..0000000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index c3472a24801fd..eb807ad461511 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -209,4 +209,33 @@ void LerpInferMeta(const MetaTensor& x, out->share_lod(x); } +void LinspaceInferMeta(const MetaTensor& start, + const MetaTensor& stop, + const MetaTensor& number, + MetaTensor* out) { + auto s_dims = start.dims(); + PADDLE_ENFORCE_EQ( + (s_dims.size() == 1) && (s_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Start) must be [1]," + "but received input shape is [%s].", + s_dims)); + auto e_dims = stop.dims(); + PADDLE_ENFORCE_EQ( + (e_dims.size() == 1) && (e_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Stop) must be [1]," + "but received input shape is [%s].", + e_dims)); + auto step_dims = number.dims(); + PADDLE_ENFORCE_EQ( + (step_dims.size() == 1) && (step_dims[0] == 1), + true, + phi::errors::InvalidArgument("The shape of Input(Num) must be [1]," + "but received input shape is [%s].", + step_dims)); + out->set_dims(phi::make_ddim({-1})); + out->set_dtype(start.dtype()); +} + } // namespace phi diff --git a/paddle/phi/infermeta/ternary.h 
index cff57e1ba7078..4dec14425166f 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -58,4 +58,9 @@ void LerpInferMeta(const MetaTensor& x,
                    const MetaTensor& weight,
                    MetaTensor* out);
 
+void LinspaceInferMeta(const MetaTensor& start,
+                       const MetaTensor& stop,
+                       const MetaTensor& number,
+                       MetaTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/linspace_kernel.cc b/paddle/phi/kernels/cpu/linspace_kernel.cc
new file mode 100644
index 0000000000000..4b8b7f7a2e05c
--- /dev/null
+++ b/paddle/phi/kernels/cpu/linspace_kernel.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/linspace_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/data_type_transform.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LinspaceKernel(const Context& ctx,
+                    const DenseTensor& start,
+                    const DenseTensor& stop,
+                    const DenseTensor& number,
+                    DataType dtype,
+                    DenseTensor* out) {
+  int32_t num = number.data<int32_t>()[0];
+  auto start_t = phi::funcs::TransDataType(ctx, start, dtype);
+  auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype);
+
+  T start_data = start_t.template data<T>()[0];
+  T stop_data = stop_t.template data<T>()[0];
+  PADDLE_ENFORCE_GT(
+      num,
+      0,
+      phi::errors::InvalidArgument("The num of linspace op should be larger "
+                                   "than 0, but received num is %d",
+                                   num));
+
+  out->Resize(phi::make_ddim({num}));
+  T* out_data = ctx.template Alloc<T>(out);
+
+  if (num > 1) {
+    // step should be of double type for all types
+    double step = (static_cast<double>(stop_data - start_data)) / (num - 1);
+    int half_num = num / 2;
+    for (int i = 0; i < num; ++i) {
+      if (i < half_num) {
+        out_data[i] = static_cast<T>(start_data + step * i);
+      } else {
+        out_data[i] = static_cast<T>(stop_data - step * (num - i - 1));
+      }
+    }
+  } else {
+    out_data[0] = static_cast<T>(start_data);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(linspace,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::LinspaceKernel,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double) {}
diff --git a/paddle/phi/kernels/funcs/data_type_transform.h b/paddle/phi/kernels/funcs/data_type_transform.h
new file mode 100644
index 0000000000000..ad7f2aa192ce4
--- /dev/null
+++ b/paddle/phi/kernels/funcs/data_type_transform.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/common/data_type.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+
+namespace phi {
+namespace funcs {
+
+template <typename Context>
+phi::DenseTensor TransDataType(const Context& dev_ctx,
+                               const phi::DenseTensor& x,
+                               DataType dtype) {
+  VLOG(3) << "TransDataType "
+          << "src type: " << x.dtype() << "; dst type: " << dtype;
+
+  switch (x.dtype()) {
+    case DataType::FLOAT32:
+      return phi::Cast<float>(dev_ctx, x, dtype);
+    case DataType::FLOAT64:
+      return phi::Cast<double>(dev_ctx, x, dtype);
+    case DataType::INT32:
+      return phi::Cast<int32_t>(dev_ctx, x, dtype);
+    case DataType::INT64:
+      return phi::Cast<int64_t>(dev_ctx, x, dtype);
+    case DataType::FLOAT16:
+      return phi::Cast<phi::dtype::float16>(dev_ctx, x, dtype);
+    case DataType::BFLOAT16:
+      return phi::Cast<phi::dtype::bfloat16>(dev_ctx, x, dtype);
+    case DataType::BOOL:
+      return phi::Cast<bool>(dev_ctx, x, dtype);
+    case DataType::INT16:
+      return phi::Cast<int16_t>(dev_ctx, x, dtype);
+    case DataType::UINT8:
+      return phi::Cast<uint8_t>(dev_ctx, x, dtype);
+    default:
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "Data type (%s) is not supported when casting data type.",
+          x.dtype()));
+  }
+}
+
+}  // namespace funcs
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/linspace_kernel.cu b/paddle/phi/kernels/gpu/linspace_kernel.cu
new file mode 100644
index 0000000000000..3a6ff365c11db
--- /dev/null
+++ b/paddle/phi/kernels/gpu/linspace_kernel.cu
@@ -0,0 +1,97 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/linspace_kernel.h"
+
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/data_type_transform.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+namespace phi {
+
+template <typename T>
+__global__ void LinspaceKernelInner(
+    T start, T stop, double step, int64_t size, T* out) {
+  int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
+
+  for (; index < size; index += blockDim.x * gridDim.x) {
+    if (index < size / 2) {
+      out[index] = static_cast<T>(start + step * index);
+    } else {
+      out[index] = static_cast<T>(stop - step * (size - index - 1));
+    }
+  }
+}
+
+template <typename T>
+__global__ void LinspaceSpecialKernel(T start, T* out) {
+  out[0] = static_cast<T>(start);
+}
+
+template <typename T, typename Context>
+void LinspaceKernel(const Context& ctx,
+                    const DenseTensor& start,
+                    const DenseTensor& stop,
+                    const DenseTensor& number,
+                    DataType dtype,
+                    DenseTensor* out) {
+  auto start_t = phi::funcs::TransDataType(ctx, start, dtype);
+  auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype);
+
+  DenseTensor n_start;
+  DenseTensor n_stop;
+  DenseTensor n_num;
+  phi::Copy(ctx, start_t, phi::CPUPlace(), false, &n_start);
+  T start_data = n_start.data<T>()[0];
+  phi::Copy(ctx, stop_t, phi::CPUPlace(), false, &n_stop);
+  T stop_data = n_stop.data<T>()[0];
+  phi::Copy(ctx, number, phi::CPUPlace(), false, &n_num);
+  int64_t num = static_cast<int64_t>(n_num.data<int32_t>()[0]);
+
+  PADDLE_ENFORCE_GT(
+      num,
+      0,
+      phi::errors::InvalidArgument("The num of linspace op should be larger "
+                                   "than 0, but received num is %d",
+                                   num));
+
+  out->Resize(phi::make_ddim({num}));
+  T* out_data = ctx.template Alloc<T>(out);
+
+  double step = 0;
+  auto stream = ctx.stream();
+  int block = 512;
+  int grid = (num + block - 1) / block;
+  if (num != 1) {
+    step = (static_cast<double>(stop_data - start_data)) / (num - 1);
+    LinspaceKernelInner<T><<<grid, block, 0, stream>>>(
+        start_data, stop_data, step, num, out_data);
+  } else {
+    LinspaceSpecialKernel<T><<<grid, block, 0, stream>>>(start_data, out_data);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(linspace,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LinspaceKernel,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double) {}
diff --git a/paddle/phi/kernels/linspace_kernel.h b/paddle/phi/kernels/linspace_kernel.h
new file mode 100644
index 0000000000000..ca2b940aef965
--- /dev/null
+++ b/paddle/phi/kernels/linspace_kernel.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void LinspaceKernel(const Context& ctx,
+                    const DenseTensor& start,
+                    const DenseTensor& stop,
+                    const DenseTensor& number,
+                    DataType dtype,
+                    DenseTensor* out);
+
+}  // namespace phi

From 05ff6cc52d309ccfba217225f62b1bc427d626e2 Mon Sep 17 00:00:00 2001
From: Weilong Wu
Date: Wed, 9 Mar 2022 12:18:38 +0800
Subject: [PATCH 48/50] bypass eager mode (#40245)

---
 .../paddle/fluid/tests/unittests/test_function_hook.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_function_hook.py b/python/paddle/fluid/tests/unittests/test_function_hook.py
index d45ef528261f3..55981b01c4084 100644
--- a/python/paddle/fluid/tests/unittests/test_function_hook.py
+++ b/python/paddle/fluid/tests/unittests/test_function_hook.py
@@ -20,6 +20,7 @@
 import paddle.fluid.core as core
 
 from paddle import _C_ops
+from paddle.fluid.framework import _test_eager_guard
 
 
 class TestCapture:
@@ -41,7 +42,7 @@ def grad_hook(grad):
 
 
 class TestBakcwardFunctionHookError(unittest.TestCase):
-    def test_hook(self):
+    def func_hook(self):
        input_data = np.ones([4, 4]).astype('float32')
        x = paddle.to_tensor(input_data.astype(np.float32), stop_gradient=False)
@@ -58,6 +59,12 @@ def test_hook(self):
 
         assert test_cap.list == [1, 2, 1]
 
+    def test_hook(self):
+        # _register_void_function_post_hook is not supported in eager mode
+        with _test_eager_guard():
+            pass
+        self.func_hook()
+
 
 if __name__ == "__main__":
     unittest.main()

From c1116b657ee99f8501ff065578fe8b07de97e889 Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Wed, 9 Mar 2022 12:57:20 +0800
Subject: [PATCH 49/50] Fix code style (#40344)

* fix code style

* test=document_fix

* fix code style

---
 python/paddle/profiler/profiler_statistic.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/paddle/profiler/profiler_statistic.py b/python/paddle/profiler/profiler_statistic.py
index e39871c7365ba..7400f21e91365 100755
--- a/python/paddle/profiler/profiler_statistic.py
+++ b/python/paddle/profiler/profiler_statistic.py
@@ -170,9 +170,9 @@ def parse(self, nodetrees):
                 CPUTimeRange[hostnode.type].append(
                     (hostnode.start_ns, hostnode.end_ns))
                 self.call_times[hostnode.type] += 1
-                if hostnode.type == TracerEventType.Operator and any(
-                    [name in hostnode.name for name in
-                     _CommunicationOpName]):  # special case, communication op
+                if hostnode.type == TracerEventType.Operator and any([
+                        name in hostnode.name for name in _CommunicationOpName
+                ]):  # special case, communication op
                     CPUTimeRange[TracerEventType.Communication].append(
                         (hostnode.start_ns, hostnode.end_ns))
                     self.call_times[TracerEventType.Communication] += 1

From e0866dc630dc8dc81567d0644c0688976132eb2c Mon Sep 17 00:00:00 2001
From: WangXi
Date: Wed, 9 Mar 2022 13:53:03 +0800
Subject: [PATCH 50/50] [hybrid] fused_feedforward op support tensor model parallel (#40160)

---
 .../operators/fused/fused_feedforward_op.cc   |   2 +
 .../operators/fused/fused_feedforward_op.cu   |  48 ++-
 .../fluid/tests/unittests/CMakeLists.txt      |   2 +
 ...static_model_parallel_fused_feedforward.py | 384 ++++++++++++++++++
 ...static_model_parallel_fused_feedforward.py |  45 ++
 5 files changed, 476 insertions(+), 5 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py

diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc
index 0c8eae4260441..f3f8f17427577 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cc
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc
@@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(false);
     AddAttr<int>("dropout1_seed", "Dropout1 random seed.").SetDefault(0);
     AddAttr<int>("dropout2_seed", "Dropout2 random seed.").SetDefault(0);
+    AddAttr<int>("ring_id", "ring id for tensor model parallel.")
+        .SetDefault(-1);
     AddComment(R"DOC(
   the function of fused_feedforward operator is the same as the following pseudo code:
   residual = src;
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 3131269955bdd..c38d9f7d4bcbd 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -21,11 +21,39 @@ limitations under the License. */
 #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
 #include "paddle/fluid/operators/layer_norm_kernel.cu.h"
 
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
+#endif
+
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
 
+template <typename T>
+static void AllReduce(framework::Tensor& tensor,  // NOLINT
+                      const int ring_id,
+                      const platform::CUDADeviceContext& ctx) {
+  if (ring_id == -1) return;
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+  auto dtype =
+      platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype()));
+  int64_t numel = tensor.numel();
+  const void* sendbuff = tensor.data<T>();
+  auto place = ctx.GetPlace();
+  void* recvbuff = tensor.mutable_data<T>(place);
+  auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
+  auto stream = ctx.stream();
+  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(
+      sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream));
+#else
+  PADDLE_THROW(platform::errors::Unimplemented(
+      "PaddlePaddle should compile with NCCL or RCCL when used tensor model "
+      "parallel op."));
+#endif
+}
+
 template <typename T>
 class FusedFeedForwardKernel : public framework::OpKernel<T> {
  public:
@@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
            framework::Tensor* dropout1_out, framework::Tensor* dropout2_out,
            const int bsz_seq, const int d_model, const int dim_feedforward,
            const std::string& act_method, const bool pre_layer_norm,
-           const float epsilon1, const float epsilon2,
+           const float epsilon1, const float epsilon2, const int ring_id,
           const DropoutParam& dropout_param1,
           const DropoutParam& dropout_param2,
           const platform::CUDADeviceContext& ctx) const {
@@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
     framework::Tensor linear2_out;
     linear2_out.mutable_data<T>({bsz_seq, d_model}, place);
     MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out);
+
+    // tensor model parallel
+    AllReduce<T>(linear2_out, ring_id, ctx);
+
     if (!pre_layer_norm) {
       fused_dropout_layernorm_helper.LayernormResidualDropoutBias(
           ctx, linear2_out.data<T>(), x.data<T>(), linear2_bias_ptr,
@@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
 
     const float epsilon1 = context.Attr<float>("ln1_epsilon");
     const float epsilon2 = context.Attr<float>("ln2_epsilon");
+    const int ring_id = context.Attr<int>("ring_id");
     DropoutParam dropout_param1(context, 1);
     DropoutParam dropout_param2(context, 2);
@@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
         dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance,
         linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model,
         dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2,
-        dropout_param1, dropout_param2, context.cuda_device_context());
+        ring_id, dropout_param1, dropout_param2, context.cuda_device_context());
   }
 };
 
@@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
               const int dim_feedforward, const DropoutParam& dropout_param1,
               const DropoutParam& dropout_param2, const std::string& act_method,
               const bool pre_layer_norm, const float epsilon1, const float epsilon2,
-              const platform::CUDADeviceContext& ctx) const {
+              const int ring_id, const platform::CUDADeviceContext& ctx) const {
     FusedDropoutLayerNormHelper<T, uint8_t> pre_layernorm_helper(
         bsz_seq, d_model, epsilon1);
     FusedDropoutHelper<T, uint8_t> fused_act_dropout_helper(
@@ -295,13 +328,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
       d_ln1_out.mutable_data<T>({bsz_seq, d_model}, place);
       MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out,
                  d_linear1_weight);
-
+      // tensor model parallel
+      AllReduce<T>(d_ln1_out, ring_id, ctx);
       pre_layernorm_helper.LayerNormGrad(
           ctx, d_ln1_out.data<T>(), x.data<T>(), ln1_gamma_ptr,
           ln1_mean->data<U>(), ln1_variance->data<U>(), d_x->data<T>(),
          d_ln1_gamma_ptr, d_ln1_beta_ptr);
     } else {
       MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight);
+      // tensor model parallel
+      AllReduce<T>(*d_x, ring_id, ctx);
     }
     std::vector<const framework::Tensor*> ins(2);
     std::vector<framework::Tensor*> outs(1);
@@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
 
     const float epsilon1 = context.Attr<float>("ln1_epsilon");
     const float epsilon2 = context.Attr<float>("ln2_epsilon");
+    const int ring_id = context.Attr<int>("ring_id");
     const std::string act_method = context.Attr<std::string>("act_method");
     DropoutParam dropout_param1(context, 1);
     DropoutParam dropout_param2(context, 2);
@@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
         d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale,
         d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model,
         dim_feedforward, dropout_param1, dropout_param2, act_method,
-        pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context());
+        pre_layer_norm, epsilon1, epsilon2, ring_id,
+        context.cuda_device_context());
   }
 };
 }  // namespace operators
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index f8102ec408043..be91fb4fdf6d0 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,6 +23,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist)
 list(APPEND DIST_TEST_OPS test_pipeline)
 list(APPEND DIST_TEST_OPS test_ir_pass_pipeline)
 list(APPEND DIST_TEST_OPS test_static_model_parallel)
+list(APPEND DIST_TEST_OPS test_static_model_parallel_fused_feedforward)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height)
@@ -1150,6 +1151,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32)
     set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120)
     set_tests_properties(test_ir_pass_pipeline PROPERTIES TIMEOUT 120)
     set_tests_properties(test_static_model_parallel PROPERTIES TIMEOUT 240)
+    set_tests_properties(test_static_model_parallel_fused_feedforward PROPERTIES TIMEOUT 120)
     set_tests_properties(test_collective_split_embedding
         test_collective_split_embedding_none_divisible
         test_collective_split_row_linear
diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
new file mode 100644
index 0000000000000..5f467da6a6465
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py
@@ -0,0 +1,384 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+
+import paddle
+import paddle.fluid as fluid
+from test_dist_base import TestDistRunnerBase, runtime_main
+import paddle.distributed.fleet as fleet
+
+from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype
+from paddle.fluid.dygraph.layers import Layer
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.nn.initializer import Constant
+
+paddle.enable_static()
+
+DTYPE = "float32"
+MODEL_PARALLEL_SIZE = 2
+IN_SIZE = 2 * MODEL_PARALLEL_SIZE
+OUT_SIZE = 2 * MODEL_PARALLEL_SIZE
+
+
+def fused_feedforward(x,
+                      linear1_weight,
+                      linear2_weight,
+                      linear1_bias=None,
+                      linear2_bias=None,
+                      ln1_scale=None,
+                      ln1_bias=None,
+                      ln2_scale=None,
+                      ln2_bias=None,
+                      dropout1_rate=0.5,
+                      dropout2_rate=0.5,
+                      activation="relu",
+                      ln1_epsilon=1e-5,
+                      ln2_epsilon=1e-5,
+                      pre_layer_norm=False,
+                      training=True,
+                      mode='upscale_in_train',
+                      ring_id=-1,
+                      name=None):
+    seed = None
+    if mode not in ('downscale_in_infer', 'upscale_in_train'):
+        raise ValueError(
+            "mode argument should be 'downscale_in_infer' or 'upscale_in_train'")
+    mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode  # semantic transfer
+
+    helper = LayerHelper("fused_feedforward")
+    dtype = x.dtype
+    check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'],
+                             'fused_feedforward')
+    check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'],
+                'fused_feedforward')
+
+    out = helper.create_variable_for_type_inference(x.dtype)
+    dropout1_mask = helper.create_variable_for_type_inference(
+        'uint8', stop_gradient=True)
+    dropout2_mask = helper.create_variable_for_type_inference(
+        'uint8', stop_gradient=True)
+    ln1_mean = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln1_variance = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln2_mean = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln2_variance = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    linear1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    ln1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    dropout1_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+    dropout2_out = helper.create_variable_for_type_inference(
+        x.dtype, stop_gradient=True)
+
+    if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
+        seed = helper.main_program.random_seed
+
+    helper.append_op(
+        type='fused_feedforward',
+        inputs={
+            'X': x,
+            'Linear1Weight': linear1_weight,
+            'Linear1Bias': linear1_bias,
+            'Linear2Weight': linear2_weight,
+            'Linear2Bias': linear2_bias,
+            'Ln1Scale': ln1_scale,
+            'Ln1Bias': ln1_bias,
+            'Ln2Scale': ln2_scale,
+            'Ln2Bias': ln2_bias,
+        },
+        outputs={
+            'Out': out,
+            'Dropout1Mask': dropout1_mask,
+            'Dropout2Mask': dropout2_mask,
+            'Ln1Mean': ln1_mean,
+            'Ln1Variance': ln1_variance,
+            'Ln2Mean': ln2_mean,
+            'Ln2Variance': ln2_variance,
+            'Linear1Out': linear1_out,
+            'Ln1Out': ln1_out,
+            'Dropout1Out': dropout1_out,
+            'Dropout2Out': dropout2_out,
+        },
+        attrs={
+            'dropout1_rate': dropout1_rate,
+            'dropout2_rate': dropout2_rate,
+            'act_method': activation,
+            'pre_layer_norm': pre_layer_norm,
+            'ln1_epsilon': ln1_epsilon,
+            'ln2_epsilon': ln2_epsilon,
+            'dropout1_is_test': not training,
+            'dropout2_is_test': not training,
+            'dropout1_fix_seed': seed is not None,
+            'dropout2_fix_seed': seed is not None,
+            'dropout1_seed': seed if seed is not None else 0,
+            'dropout2_seed': seed if seed is not None else 0,
+            'dropout1_implementation': mode,
+            'dropout2_implementation': mode,
+            'ring_id': ring_id,
+        })
+    return out
+
+
+def _set_var_distributed(var):
+    if var is None:
+        return
+
+    var.is_distributed = True
+
+    # NOTE: use current_block and find_var_recursive to support while_loop
+    startup_block = paddle.static.default_startup_program().current_block()
+    main_block = paddle.static.default_main_program().current_block()
+    startup_block._find_var_recursive(var.name).is_distributed = True
+    main_block._find_var_recursive(var.name).is_distributed = True
+
+
+class ParallelFusedFeedForward(Layer):
+    def __init__(self,
+                 d_model,
+                 dim_feedforward,
+                 dropout_rate=0.1,
+                 epsilon=1e-05,
+                 activation="relu",
+                 act_dropout_rate=None,
+                 normalize_before=False,
+                 linear1_weight_attr=None,
+                 linear1_bias_attr=None,
+                 linear2_weight_attr=None,
+                 linear2_bias_attr=None,
+                 ln1_scale_attr=None,
+                 ln1_bias_attr=None,
+                 ln2_scale_attr=None,
+                 ln2_bias_attr=None,
+                 nranks=1,
+                 ring_id=-1,
+                 name=None):
+        super(ParallelFusedFeedForward, self).__init__()
+        assert d_model > 0, (
+            "Expected d_model to be greater than 0, but received {}".format(
+                d_model))
+        assert dim_feedforward > 0, (
+            "Expected dim_feedforward to be greater than 0, but received {}".
+            format(dim_feedforward))
+
+        self._dtype = self._helper.get_default_dtype()
+        self._d_model = d_model
+
+        assert dim_feedforward % nranks == 0
+        dim_feedforward = dim_feedforward // nranks
+        self._dim_feedforward = dim_feedforward
+        self._dropout_rate = dropout_rate
+        self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
+        self._act_method = activation
+        self._normalize_before = normalize_before
+        self._epsilon = epsilon
+        self._ring_id = ring_id
+
+        self._linear1_weight = self.create_parameter(
+            shape=[d_model, dim_feedforward],
+            attr=linear1_weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+        self._linear1_bias = self.create_parameter(
+            shape=[dim_feedforward],
+            attr=linear1_bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+
+        self._linear2_weight = self.create_parameter(
+            shape=[dim_feedforward, d_model],
+            attr=linear2_weight_attr,
+            dtype=self._dtype,
+            is_bias=False)
+
+        self._linear2_bias = self.create_parameter(
+            shape=[d_model],
+            attr=linear2_bias_attr,
+            dtype=self._dtype,
+            is_bias=True)
+
+        if nranks > 1:
+            assert ring_id != -1
+            # column parallel
+            _set_var_distributed(self._linear1_weight)
+            _set_var_distributed(self._linear1_bias)
+            _set_var_distributed(self._linear2_weight)
+
+        if normalize_before:
+            self._ln1_scale = self.create_parameter(
+                shape=[d_model],
+                attr=ln1_scale_attr,
+                is_bias=False,
+                default_initializer=Constant(1.0))
+            self._ln1_bias = self.create_parameter(
+                shape=[d_model], attr=ln1_bias_attr, is_bias=True)
+            self._ln2_scale = None
+            self._ln2_bias = None
+        else:
+            self._ln1_scale = None
+            self._ln1_bias = None
+            self._ln2_scale = self.create_parameter(
+                shape=[d_model],
+                attr=ln2_scale_attr,
+                is_bias=False,
+                default_initializer=Constant(1.0))
+            self._ln2_bias = self.create_parameter(
+                shape=[d_model], attr=ln2_bias_attr, is_bias=True)
+
+        self.name = name
+
+    def forward(self, src, cache=None):
+        out = fused_feedforward(
+            src,
+            self._linear1_weight,
+            self._linear2_weight,
+            self._linear1_bias,
+            self._linear2_bias,
+            self._ln1_scale,
+            self._ln1_bias,
+            self._ln2_scale,
+            self._ln2_bias,
+            dropout1_rate=self._act_dropout_rate,
+            dropout2_rate=self._dropout_rate,
+            activation=self._act_method,
+            ln1_epsilon=self._epsilon,
+            ln2_epsilon=self._epsilon,
+            pre_layer_norm=self._normalize_before,
+            training=self.training,
+            ring_id=self._ring_id,
+            name=self.name)
+        return out
+
+
+def get_param_attr(weight, bias):
+    weight_attr = paddle.ParamAttr(
+        initializer=fluid.initializer.NumpyArrayInitializer(weight))
+    bias_attr = paddle.ParamAttr(
+        initializer=fluid.initializer.NumpyArrayInitializer(bias))
+    return weight_attr, bias_attr
+
+
+def create_model(data, rank):
+    np.random.seed(2021)
+    ln_w = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE)
+    ln_b = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE)
+    w0 = np.random.uniform(-1, 1, size=(IN_SIZE, OUT_SIZE)).astype(DTYPE)
+    b0 = np.random.uniform(-1, 1, size=(OUT_SIZE, )).astype(DTYPE)
+    w1 = np.random.uniform(-1, 1, size=(OUT_SIZE, IN_SIZE)).astype(DTYPE)
+    b1 = np.random.uniform(-1, 1, size=(IN_SIZE, )).astype(DTYPE)
+    data.stop_gradient = False
+    if rank is not None:
+        start = 0 if rank == 0 else OUT_SIZE // MODEL_PARALLEL_SIZE
+        end = start + OUT_SIZE // MODEL_PARALLEL_SIZE
+        col_w0 = w0[:, start:end]
+        col_b0 = b0[start:end]
+        row_w1 = w1[start:end, :]
+
+        ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b)
+        w0_attr, b0_attr = get_param_attr(col_w0, col_b0)
+        w1_attr, b1_attr = get_param_attr(row_w1, b1)
+
+        ffn = ParallelFusedFeedForward(
+            IN_SIZE,
+            OUT_SIZE,
+            dropout_rate=0.0,
+            activation='gelu',
+            normalize_before=True,
+            linear1_weight_attr=w0_attr,
+            linear1_bias_attr=b0_attr,
+            linear2_weight_attr=w1_attr,
+            linear2_bias_attr=b1_attr,
+            ln1_scale_attr=ln_w_attr,
+            ln1_bias_attr=ln_b_attr,
+            nranks=MODEL_PARALLEL_SIZE,
+            ring_id=0)
+        # ffn.eval()
+        result = ffn(data)
+    else:
+        ln_w_attr, ln_b_attr = get_param_attr(ln_w, ln_b)
+        w0_attr, b0_attr = get_param_attr(w0, b0)
+        w1_attr, b1_attr = get_param_attr(w1, b1)
+
+        ffn = ParallelFusedFeedForward(
+            IN_SIZE,
+            OUT_SIZE,
+            dropout_rate=0.0,
+            activation='gelu',
+            normalize_before=True,
+            linear1_weight_attr=w0_attr,
+            linear1_bias_attr=b0_attr,
+            linear2_weight_attr=w1_attr,
+            linear2_bias_attr=b1_attr,
+            ln1_scale_attr=ln_w_attr,
+            ln1_bias_attr=ln_b_attr)
+        # ffn.eval()
+        result = ffn(data)
+
+    predict = paddle.sum(result)
+    return predict
+
+
+class TestModelParallel(TestDistRunnerBase):
+    def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
+        # Input data
+        seq_len = 2
+        data_in = fluid.data(
+            name='data_in', shape=[batch_size, seq_len, IN_SIZE], dtype=DTYPE)
+
+        if dist_strategy:
+            data_loader = fluid.io.DataLoader.from_generator(
+                feed_list=[data_in],
+                capacity=64,
+                use_double_buffer=False,
+                iterable=False)
+
+        if dist_strategy:
+            fleet.init(is_collective=True)
+            strategy = fleet.DistributedStrategy()
+            strategy.tensor_parallel = True
+            strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2}
+
+        rank = fleet.worker_index() if dist_strategy else None
+        avg_cost = create_model(data_in, rank)
+        opt = fluid.optimizer.SGD(0.1)
+
+        if dist_strategy:
+            dist_opt = fleet.distributed_optimizer(
+                optimizer=opt, strategy=strategy)
+            dist_opt.minimize(avg_cost)
+        else:
+            opt.minimize(avg_cost)
+
+        def gen_data():
+            np.random.seed(2021)
+            while True:
+                data = [np.random.random([seq_len, IN_SIZE]).astype(DTYPE)]
+                yield data
+
+        train_reader = paddle.batch(gen_data, batch_size=batch_size)
+
+        if dist_strategy:
+            return None, avg_cost, train_reader, None, None, None, data_loader
+        else:
+            return None, avg_cost, train_reader, None, None, None
+
+
+if __name__ == "__main__":
+    runtime_main(TestModelParallel)
diff --git a/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py
new file mode 100644
index 0000000000000..1a6b637e1b45e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_static_model_parallel_fused_feedforward.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+from test_dist_base import TestDistBase
+
+import os
+import paddle
+
+paddle.enable_static()
+flag_name = os.path.splitext(__file__)[0]
+
+
+class TestStaticModelParallel(TestDistBase):
+    def _setup_config(self):
+        self._sync_mode = True
+        self._use_reduce = False
+        self._use_reader_alloc = False
+        self._nccl_comm_num = 1
+        self._pipeline_mode = True
+
+    def test_dist_static_model_parallel_fused_feedforward(self):
+        import paddle.fluid as fluid
+        if fluid.core.is_compiled_with_cuda():
+            self.check_with_place(
+                "static_model_parallel_fused_feedforward.py",
+                delta=1e-5,
+                check_error_log=True,
+                log_name=flag_name)
+
+
+if __name__ == '__main__':
+    unittest.main()
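
Editor's appendix (sketches only; none of the code below belongs to the patches above).

A note on the linspace kernels moved to phi earlier in this series: both the CPU and GPU implementations fill the first half of the output forward from start and the second half backward from stop, which keeps both endpoints exact and splits the accumulated rounding error between the two halves instead of letting it pile up at stop. A minimal NumPy sketch of that two-sided fill (illustration only):

import numpy as np

def two_sided_linspace(start, stop, num, dtype=np.float32):
    # Mirrors the kernel logic: exact endpoints, rounding error split
    # between the forward and backward halves.
    out = np.empty(num, dtype=dtype)
    if num == 1:
        out[0] = start
        return out
    step = (float(stop) - float(start)) / (num - 1)  # step kept in double
    for i in range(num):
        if i < num // 2:
            out[i] = start + step * i             # walk forward from start
        else:
            out[i] = stop - step * (num - i - 1)  # walk backward from stop
    return out

print(two_sided_linspace(0.0, 1.0, 5))  # -> [0.   0.25 0.5  0.75 1.  ]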
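For readers of PATCH 50: in the pre_layer_norm configuration the test exercises, fused_feedforward computes, roughly, residual + dropout2(linear2(dropout1(activation(linear1(layer_norm(x)))))). The sketch below states that shape of computation at the NumPy level under simplifying assumptions (dropout disabled, relu activation; the fused op itself supports more options):

import numpy as np

def layer_norm(x, scale, bias, eps=1e-5):
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps) * scale + bias

def feedforward_pre_ln(x, w1, b1, w2, b2, ln_scale, ln_bias):
    # residual + linear2(relu(linear1(layer_norm(x)))); dropout omitted
    h = layer_norm(x, ln_scale, ln_bias)
    h = np.maximum(h @ w1 + b1, 0.0)  # linear1 + relu
    return x + (h @ w2 + b2)          # linear2 + residual connection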
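The reason a single allreduce suffices in PATCH 50: linear1 is split by columns and linear2 by rows, and because the activation is elementwise it commutes with the column split, so each rank's partial linear2 output simply sums to the full result — the sum is exactly what the new AllReduce call performs. A small NumPy check of that identity (illustrative only; "ranks" are simulated with slices):

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 4))
w1 = rng.standard_normal((4, 8))  # column-parallel: each rank keeps 4 columns
w2 = rng.standard_normal((8, 4))  # row-parallel: each rank keeps 4 rows

full = np.maximum(x @ w1, 0.0) @ w2  # single-rank reference

partial = np.zeros_like(full)
for r in range(2):  # two simulated ranks
    cols = slice(r * 4, (r + 1) * 4)
    h_r = np.maximum(x @ w1[:, cols], 0.0)  # this rank's hidden shard
    partial += h_r @ w2[cols, :]            # summing partials == allreduce

assert np.allclose(full, partial)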