diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index 4cc2577f6b2ec..a578c9f7d8108 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -17,15 +17,9 @@ template using CUDAReduceMeanGradKernel = - ops::ReduceGradKernel; - -using FP16CUDAReduceMeanGradKernel = - ops::ReduceGradKernel; + ops::ReduceCudaGradKernel; REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, - FP16CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 4f275717bced8..3b8ea60963d62 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -30,6 +30,7 @@ limitations under the License. */ #if defined(__HIPCC__) || defined(__NVCC__) #include "paddle/pten/kernels/gpu/reduce.h" +#include "paddle/pten/kernels/gpu/reduce_grad.h" #endif namespace paddle { @@ -620,11 +621,12 @@ class ReduceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); auto input_data_type = - (in_dtype >= 0) ? static_cast(in_dtype) - : OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); + (out_dtype >= 0) + ? static_cast(out_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { auto dx_dims = ctx.Input("X")->dims(); @@ -730,6 +732,55 @@ class ReduceCudaKernel : public framework::OpKernel { dev_ctx, *input, reduce_all, dims_int64, false, pt_out_dtype, output); } }; + +template class TransformOp> +class ReduceCudaGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + std::vector dims = context.Attr>("dim"); + auto* in_x = context.Input("X"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto out_dtype = context.Attr("in_dtype"); + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + // make new tensor + framework::Tensor new_d_out(d_out->type()); + new_d_out.ShareDataWith(*d_out); + new_d_out.Resize(paddle::framework::make_ddim(update_dims)); + auto& dev_ctx = context.cuda_device_context(); + if (out_dtype > 0) { + d_x->mutable_data( + dev_ctx.GetPlace(), + static_cast(out_dtype)); + } else { + d_x->mutable_data( + dev_ctx.GetPlace(), + static_cast(d_out->type())); + } + auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out); + auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_out_dtype = pten::TransToPtenDataType( + static_cast(out_dtype)); + if (out_dtype <= 0) { + pt_out_dtype = pten::TransToPtenDataType( + static_cast(d_out->type())); + } + using MPType = typename kps::details::MPTypeTrait::Type; + pten::ReduceGrad>( + dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype, + TransformOp(reduce_num)); + } +}; #endif } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 562a5719d74d9..9a715eb98ef99 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -50,7 +50,7 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - int in_dtype = ctx.Attr("in_dtype"); + int in_dtype = ctx.Attr("out_dtype"); if (in_dtype >= 0) { return framework::OpKernelType( static_cast(in_dtype), diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 9782ce28da4af..79b3480afbcd7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -74,7 +74,7 @@ class ReduceSumGradKernel : public framework::OpKernel { auto dims = context.Attr>("dim"); if (context.GetPlace().GetType() == platform::CPUPlace().GetType() && dims.size() == 1) { - int in_dtype = context.Attr("in_dtype"); + int in_dtype = context.Attr("out_dtype"); if (in_dtype >= 0) { Tensor tmp_tensor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c629663b19ebd..c3d3e0cf6ecd5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -17,8 +17,7 @@ template using CUDAReduceSumGradKernel = - ops::ReduceGradKernel; + ops::ReduceCudaGradKernel; REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, diff --git a/paddle/pten/kernels/gpu/elementwise.h b/paddle/pten/kernels/gpu/elementwise.h index c9181a1fdfd6e..3313c1ddcfee2 100644 --- a/paddle/pten/kernels/gpu/elementwise.h +++ b/paddle/pten/kernels/gpu/elementwise.h @@ -134,12 +134,19 @@ struct DimensionsTransform { explicit DimensionsTransform(const std::vector &ins, const pten::framework::DDim &dims, int axis) { - const int N = ins.size(); + const int N = max(static_cast(ins.size()), 2); dim_size = dims.size(); out_dims = pten::framework::vectorize(dims); in_dims.resize(N); - for (int j = 0; j < N; ++j) { - in_dims[j] = pten::framework::vectorize(ins[j]->dims()); + if (ins.size() == 1) { + // when ins.size() = 1, broadcast input to output + in_dims[0] = pten::framework::vectorize(ins[0]->dims()); + in_dims[1] = out_dims; + // Add out_dims to in_dims to avoid errors in dims merging + } else { + for (int j = 0; j < N; ++j) { + in_dims[j] = pten::framework::vectorize(ins[j]->dims()); + } } InputDimensionsExtend(N, axis); diff --git a/paddle/pten/kernels/gpu/reduce_grad.h b/paddle/pten/kernels/gpu/reduce_grad.h new file mode 100644 index 0000000000000..a626f2f70e7fb --- /dev/null +++ b/paddle/pten/kernels/gpu/reduce_grad.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// CUDA and HIP use same api +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#include +#include +#include +#include +#include +#include "paddle/pten/kernels/gpu/elementwise.h" +namespace pten { +template +void ReduceGrad(const GPUContext& dev_ctx, + DenseTensor* d_out, + DenseTensor* d_x, + DataType out_dtype, + Functor functor) { + std::vector inputs = {d_out}; + std::vector outputs = {d_x}; + PD_VISIT_ALL_TYPES( + out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] { + LaunchBroadcastElementwiseCudaKernel( + dev_ctx, inputs, &outputs, 0, functor); + })); +} +} // namespace pten +#endif