From a244f18dffa6e97242a19ee010debb546bfd234d Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 18 May 2021 15:21:14 +0000 Subject: [PATCH 01/34] max_min_prod_all_any --- .../operators/reduce_ops/reduce_all_op.cu | 41 +++++++++- .../operators/reduce_ops/reduce_any_op.cu | 43 +++++++++- .../operators/reduce_ops/reduce_functor_op.h | 71 ++++++++++++++++ .../operators/reduce_ops/reduce_max_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_min_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_prod_op.cu | 81 +++++++++++++++---- 6 files changed, 295 insertions(+), 45 deletions(-) create mode 100644 paddle/fluid/operators/reduce_ops/reduce_functor_op.h diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 89f3345fcbe42..9ba255081c942 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -13,7 +13,42 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_all, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAllKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(true), + CustomLogicalAnd(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_all, ops::BoolReduceAllKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c0f94098a351e..2fcf3ba07db38 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,7 +13,42 @@ // limitations under the License. 
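// Note: the reduce_any kernel below mirrors BoolReduceAllKernel above --
// identical dim handling, but with init = false and CustomLogicalOr as the
// reducer, so a single true element in the reduced axes flips the output.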
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_any, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAnyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(false), + CustomLogicalOr(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_any, ops::BoolReduceAnyKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h new file mode 100644 index 0000000000000..07625678ca3e9 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct CustomMin { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b < a) ? b : a; + } +}; + +template +struct CustomMax { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b > a) ? 
b : a; + } +}; + +template +struct CustomSum { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b + a; + } +}; + +template +struct CustomMul { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b * a; + } +}; + +template +struct CustomLogicalOr { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b || a; + } +}; + +template +struct CustomLogicalAnd { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b && a; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 832112ede833a..acc06c6ea7a09 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_max, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_max, ops::ReduceMaxKernel, + ops::ReduceMaxKernel, ops::ReduceMaxKernel, + ops::ReduceMaxKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 7b2706866f594..6faa314834cba 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. 
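// Note: ReduceMinKernel below is the mirror image of ReduceMaxKernel --
// same structure, with init = FLT_MAX and CustomMin in place of
// -FLT_MAX and CustomMax.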
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_min, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMinKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_min, ops::ReduceMinKernel, + ops::ReduceMinKernel, ops::ReduceMinKernel, + ops::ReduceMinKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 44e76c78b1f3e..88bd0a7e9accf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -12,26 +12,75 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +namespace paddle { +namespace operators { + +template +class ReduceProdKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto out_dtype = context.Attr("out_dtype"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + if (out_dtype >= 0) { +#define VisitDataTypeSmall_t(cpp_type, proto_type) \ + do { \ + if (static_cast(out_dtype) == \ + proto_type) { \ + TensorReduce, \ + detail::IdentityFunctor>( \ + *input, output, reduce_dims, static_cast(1.0f), \ + CustomMul(), detail::IdentityFunctor(), stream); \ + } \ + } while (0) + _ForEachDataTypeSmall_(VisitDataTypeSmall_t); +#undef VisitDataTypeSmall_t + } else { + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(1.0f), CustomMul(), + detail::IdentityFunctor(), stream); + } + } +}; + +} // namespace operators +} // namespace paddle + #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #else -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #endif From af4db5db4ac616d155225e9e3a6482e52263ddd7 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 24 May 2021 16:12:41 +0800 Subject: [PATCH 02/34] Update reduce_any_op.cu --- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 2fcf3ba07db38..ff0f26ca24307 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
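Every kernel introduced in PATCH 01 repeats the same dim-normalization step
before launching the reduction: when reduce_all is set, every axis of the
input is reduced; otherwise negative axes from the "dim" attribute are
wrapped around. A standalone sketch of that shared logic (the helper name
NormalizeReduceDims is illustrative, not part of the patch -- each kernel
inlines the equivalent loop in its Compute() method):

    #include <vector>

    // Illustrative sketch only; `rank` stands for input->dims().size().
    std::vector<int> NormalizeReduceDims(const std::vector<int>& dims,
                                         int rank, bool reduce_all) {
      std::vector<int> reduce_dims;
      if (reduce_all) {
        // reduce over every axis: 0, 1, ..., rank - 1
        reduce_dims.resize(rank);
        for (int i = 0; i < rank; ++i) {
          reduce_dims[i] = i;
        }
      } else {
        // wrap negative axes, e.g. -1 becomes rank - 1 (the last axis)
        for (auto e : dims) {
          reduce_dims.push_back(e >= 0 ? e : e + rank);
        }
      }
      return reduce_dims;
    }

With rank = 3 and dims = {-1} the result is {2}; with reduce_all = true it
is {0, 1, 2}.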
From d80406608bed0f5d96e6dc8e4bdaaf48e6b99afb Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:12:09 +0000 Subject: [PATCH 03/34] modified --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 9ba255081c942..957f475134169 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -42,7 +42,7 @@ class BoolReduceAllKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(true), CustomLogicalAnd(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 2fcf3ba07db38..99716758b9a08 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -42,7 +42,7 @@ class BoolReduceAnyKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(false), CustomLogicalOr(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index acc06c6ea7a09..33825da575bd6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -41,7 +41,7 @@ class ReduceMaxKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 6faa314834cba..1667d350b9564 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -41,7 +41,7 @@ class ReduceMinKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 88bd0a7e9accf..e8cfc4d87e4dd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -53,8 +53,8 @@ class ReduceProdKernel : public framework::OpKernel { do { \ if (static_cast(out_dtype) == \ proto_type) { \ - TensorReduce, \ - detail::IdentityFunctor>( \ + TensorReduceFunc, \ + detail::IdentityFunctor>( \ *input, output, reduce_dims, static_cast(1.0f), \ CustomMul(), detail::IdentityFunctor(), stream); \ } \ @@ -62,7 +62,7 @@ class ReduceProdKernel : public 
framework::OpKernel { _ForEachDataTypeSmall_(VisitDataTypeSmall_t); #undef VisitDataTypeSmall_t } else { - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(1.0f), CustomMul(), detail::IdentityFunctor(), stream); } From 6ea9e9ad7853d226e9962031251e0046e7e2b265 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:23:47 +0000 Subject: [PATCH 04/34] copyright --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 957f475134169..1500d513ce9bf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 405389997067a..ea06b8dfc58f6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 33825da575bd6..dc04b094b4288 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 1667d350b9564..88f654060d4c6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index e8cfc4d87e4dd..27259acc928ea 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From ff0a6e973d3084c41bc411fc38f74b952eee0125 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 02:18:47 +0000 Subject: [PATCH 05/34] modified and {} for loop --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 5 ++++- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 4 +++- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 1500d513ce9bf..ac5e35a350a8d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -34,7 +34,9 @@ class BoolReduceAllKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index ea06b8dfc58f6..0b3799f216328 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -34,7 +34,10 @@ class BoolReduceAnyKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index dc04b094b4288..cbf8101c3668b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -33,7 +33,9 @@ class ReduceMaxKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 88f654060d4c6..406398bb70a9b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -33,7 +33,9 @@ class ReduceMinKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 27259acc928ea..10f2df15a51d0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -35,7 +35,9 @@ class ReduceProdKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); From 7ddaf91d613925f649a9019d2c4ab528a2971b0b Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 18 May 2021 15:21:14 +0000 Subject: [PATCH 06/34] max_min_prod_all_any --- .../operators/reduce_ops/reduce_all_op.cu | 41 +++++++++- .../operators/reduce_ops/reduce_any_op.cu | 43 +++++++++- .../operators/reduce_ops/reduce_functor_op.h | 14 ++++ .../operators/reduce_ops/reduce_max_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_min_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_prod_op.cu | 81 +++++++++++++++---- 6 files changed, 238 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 89f3345fcbe42..9ba255081c942 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -13,7 +13,42 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_all, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAllKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(true), + CustomLogicalAnd(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_all, ops::BoolReduceAllKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c0f94098a351e..2fcf3ba07db38 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,7 +13,42 @@ // limitations under the License. 
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_any, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAnyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(false), + CustomLogicalOr(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_any, ops::BoolReduceAnyKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index f4ea18edb2a95..2aae10337c4d3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -54,5 +54,19 @@ struct CustomMul { } }; +template +struct CustomLogicalOr { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b || a; + } +}; + +template +struct CustomLogicalAnd { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b && a; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 832112ede833a..acc06c6ea7a09 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_max, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_max, ops::ReduceMaxKernel, + ops::ReduceMaxKernel, ops::ReduceMaxKernel, + ops::ReduceMaxKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 7b2706866f594..6faa314834cba 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_min, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMinKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_min, ops::ReduceMinKernel, + ops::ReduceMinKernel, ops::ReduceMinKernel, + ops::ReduceMinKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 44e76c78b1f3e..88bd0a7e9accf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -12,26 +12,75 @@ // See the License for the specific language governing permissions and // limitations under the License. 
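// Note: reduce_prod below adds one twist over the max/min kernels: it reads
// the out_dtype attribute and, when out_dtype >= 0, dispatches through
// _ForEachDataTypeSmall_ so the reduction can accumulate in the requested
// output type instead of the input type T.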
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +namespace paddle { +namespace operators { + +template +class ReduceProdKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto out_dtype = context.Attr("out_dtype"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + if (out_dtype >= 0) { +#define VisitDataTypeSmall_t(cpp_type, proto_type) \ + do { \ + if (static_cast(out_dtype) == \ + proto_type) { \ + TensorReduce, \ + detail::IdentityFunctor>( \ + *input, output, reduce_dims, static_cast(1.0f), \ + CustomMul(), detail::IdentityFunctor(), stream); \ + } \ + } while (0) + _ForEachDataTypeSmall_(VisitDataTypeSmall_t); +#undef VisitDataTypeSmall_t + } else { + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(1.0f), CustomMul(), + detail::IdentityFunctor(), stream); + } + } +}; + +} // namespace operators +} // namespace paddle + #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #else -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #endif From a43af7dd0d43b3418773a7bf3abab63d98a3d9d4 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 24 May 2021 16:12:41 +0800 Subject: [PATCH 07/34] Update reduce_any_op.cu --- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 2fcf3ba07db38..ff0f26ca24307 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
From 0a70b826cb69cbb3a3832ddae6d39efbefc9ef88 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:12:09 +0000 Subject: [PATCH 08/34] modified --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 9ba255081c942..957f475134169 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -42,7 +42,7 @@ class BoolReduceAllKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(true), CustomLogicalAnd(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index ff0f26ca24307..405389997067a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -42,7 +42,7 @@ class BoolReduceAnyKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(false), CustomLogicalOr(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index acc06c6ea7a09..33825da575bd6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -41,7 +41,7 @@ class ReduceMaxKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 6faa314834cba..1667d350b9564 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -41,7 +41,7 @@ class ReduceMinKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 88bd0a7e9accf..e8cfc4d87e4dd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -53,8 +53,8 @@ class ReduceProdKernel : public framework::OpKernel { do { \ if (static_cast(out_dtype) == \ proto_type) { \ - TensorReduce, \ - detail::IdentityFunctor>( \ + TensorReduceFunc, \ + detail::IdentityFunctor>( \ *input, output, reduce_dims, static_cast(1.0f), \ CustomMul(), detail::IdentityFunctor(), stream); \ } \ @@ -62,7 +62,7 @@ class ReduceProdKernel : public 
framework::OpKernel { _ForEachDataTypeSmall_(VisitDataTypeSmall_t); #undef VisitDataTypeSmall_t } else { - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(1.0f), CustomMul(), detail::IdentityFunctor(), stream); } From c91b26b7e5ccbbc4ad1f0fee65fec484843e2252 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:23:47 +0000 Subject: [PATCH 09/34] copyright --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 957f475134169..1500d513ce9bf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 405389997067a..ea06b8dfc58f6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 33825da575bd6..dc04b094b4288 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 1667d350b9564..88f654060d4c6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index e8cfc4d87e4dd..27259acc928ea 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From 54651e0b26cae192349ea151c71b0f1db17e4ba1 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 02:18:47 +0000 Subject: [PATCH 10/34] modified and {} for loop --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 5 ++++- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 4 +++- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 1500d513ce9bf..ac5e35a350a8d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -34,7 +34,9 @@ class BoolReduceAllKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index ea06b8dfc58f6..0b3799f216328 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -34,7 +34,10 @@ class BoolReduceAnyKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index dc04b094b4288..cbf8101c3668b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -33,7 +33,9 @@ class ReduceMaxKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 88f654060d4c6..406398bb70a9b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -33,7 +33,9 @@ class ReduceMinKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 27259acc928ea..10f2df15a51d0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -35,7 +35,9 @@ class ReduceProdKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); From 35411f7450a34b0bf978dd573a9ac55906c8916a Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 03:19:44 +0000 Subject: [PATCH 11/34] add notes for reduce_op.cuh --- .../fluid/operators/reduce_ops/reduce_op.cuh | 114 ++++++++++++------ 1 file changed, 74 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 91d7fb7c8439a..ed36260c8689f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -65,8 +65,9 @@ static inline int GetLastPow2(int n) { return std::max(1, n - (n >> 1)); } -static inline std::vector GetStrides(const std::vector& dims, - const std::vector& idx) { +// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny +static inline std::vector GetDimStrides(const std::vector& dims, + const std::vector& idx) { int n = static_cast(idx.size()); if (n == 0) return std::vector(); std::vector strides(n); @@ -83,13 +84,15 @@ constexpr int kMaxBlockDim = 256; constexpr int kMaxBlockDim = 512; #endif -static inline int GetDesiredBlockDim(int block_dim) { +// get blockDim for reduceLastDim and reduceAny +static inline int GetBlockDim(int block_dim) { return block_dim >= kMaxBlockDim ? 
kMaxBlockDim : (1 << static_cast(std::log2(block_dim))); } -static inline void CheckReduceRankIsValid(int reduce_rank, int rank) { +// check reduce rand is valid +static inline void CheckReduceRank(int reduce_rank, int rank) { if (rank % 2 == 0) { PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, platform::errors::InvalidArgument( @@ -108,8 +111,9 @@ static inline void CheckReduceRankIsValid(int reduce_rank, int rank) { } } +// convert dims from vector to array template -static inline paddle::framework::Array from( +static inline paddle::framework::Array VectorToArray( const VectorLikeType& vec) { PADDLE_ENFORCE_EQ(vec.size(), ElementCount, platform::errors::InvalidArgument( @@ -118,17 +122,19 @@ static inline paddle::framework::Array from( vec.size(), ElementCount)); size_t n = static_cast(vec.size()); paddle::framework::Array ret; - for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; + for (size_t i = 0; i < n; ++i) { + ret[i] = vec[i]; + } return ret; } } // namespace detail enum ReduceType { - kReduceAll = 0x00, - kReduceLastDim = 0x01, + kReduceAll = 0x00, // when reduce_rank == x_rank + kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim - kReduceAny = 0x03, + kReduceAny = 0x03, // when reduce_dim.size() > 1 }; // reduce config @@ -141,10 +147,13 @@ struct ReduceConfig { void Run() { // step1: update the reduce_dim left_dim and x_dim SetReduceDim(); + // step2: get the strides of dim for reduceAny and reduceLastDim SetStrides(); + // step3: get the type of reduce SetReduceType(); + // step4: set the block and grid for launch kernel SetBlockDim(); } @@ -237,9 +246,9 @@ struct ReduceConfig { idx_dim.push_back(i); } - x_strides = detail::GetStrides(x_dim, idx_dim); - reduce_strides = detail::GetStrides(x_dim, reduce_dim); - left_strides = detail::GetStrides(x_dim, left_dim); + x_strides = detail::GetDimStrides(x_dim, idx_dim); + reduce_strides = detail::GetDimStrides(x_dim, reduce_dim); + left_strides = detail::GetDimStrides(x_dim, left_dim); reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; left_num = 1; @@ -277,7 +286,7 @@ struct ReduceConfig { // for others: block(block_num, 1) , grid(left_num, 1) void SetBlockDim() { // init - int block_num = detail::GetDesiredBlockDim(reduce_num); + int block_num = detail::GetBlockDim(reduce_num); should_reduce_again = false; dim3 block_dim(block_num, 1); @@ -352,6 +361,9 @@ struct ReduceConfig { dim3 grid; }; +// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, this +// function will be used +// blockId.x -> left_num, threadId.x -> reduce_num template __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, @@ -362,8 +374,9 @@ __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, int idx_x = blockIdx.x * reduce_num; int idx_y = threadIdx.x; Ty reduce_var = init; - for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) { reduce_var = reducer(reduce_var, static_cast(x[idx_x + idx_y])); + } __syncthreads(); reduce_var = @@ -374,6 +387,11 @@ __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, } } +// when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this +// function will be used +// eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 +// if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32 +// else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 template __device__ __forceinline__ void 
ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, @@ -389,15 +407,20 @@ __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, if (idx < left_num) { int loop = reduce_num - idy; loop = loop > block_size ? block_size : loop; + for (int iy = 0; iy < loop; iy++) { int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; reduce_var = reducer(reduce_var, static_cast(x[id])); } + y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = static_cast(transformer(reduce_var)); } } +// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this +// function will be used +// blockId.x -> left_num, threadId.x -> reduce_num template __device__ __forceinline__ void ReduceAny( @@ -423,18 +446,24 @@ __device__ __forceinline__ void ReduceAny( } int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + for (int k = 0; k < Rank; ++k) { + idx_x += (sub_index[k] * x_strides[k]); + } Ty reduce_var = static_cast(x[idx_x]); for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { int reduce_idx = i; + for (int j = 0; j < ReduceRank; ++j) { sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; reduce_idx %= reduce_strides[j]; } int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + for (int k = 0; k < Rank; ++k) { + idx_x += (sub_index[k] * x_strides[k]); + } + reduce_var = static_cast(reducer(reduce_var, static_cast(x[idx_x]))); } @@ -448,6 +477,7 @@ __device__ __forceinline__ void ReduceAny( } } +// module function designed for global function template __device__ __forceinline__ void ReduceModule( @@ -458,14 +488,17 @@ __device__ __forceinline__ void ReduceModule( paddle::framework::Array reduce_strides, paddle::framework::Array left_dim, paddle::framework::Array left_strides) { + // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1 if (ReduceType == ReduceType::kReduceLastDim) { ReduceLastDim( x, y, reducer, transformer, init, reduce_num); + // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (ReduceType == ReduceType::kReduceHigherDim) { ReduceHigherDim( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); + // reduce_rank >= 2 } else { ReduceAny( x, y, reducer, transformer, init, reduce_num, x_strides, reduce_dim, @@ -491,23 +524,23 @@ __global__ void ReduceKernelFunction( template -static void launchKernel(const Tx* x_data, Ty* y_data, +static void LaunchKernel(const Tx* x_data, Ty* y_data, const platform::Place& place, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, gpuStream_t stream, ReduceConfig config) { -#define CUB_REDUCE_TYPE_CASE(type) \ - case type: { \ - constexpr auto kReduceType = type; \ - ReduceKernelFunction< \ - Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ - kReduceType><<>>( \ - x_data, config.output_data, reducer, transformer, init, \ - config.reduce_num, config.left_num, config.blocking_size, \ - detail::from(config.x_strides), \ - detail::from(config.reduce_dim), \ - detail::from(config.reduce_strides), \ - detail::from(config.left_dim), \ - detail::from(config.left_strides)); \ +#define CUB_REDUCE_TYPE_CASE(type) \ + case type: { \ + constexpr auto kReduceType = type; \ + ReduceKernelFunction< \ + Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ + kReduceType><<>>( \ + x_data, config.output_data, reducer, transformer, init, \ + config.reduce_num, config.left_num, config.blocking_size, \ + detail::VectorToArray(config.x_strides), \ + 
detail::VectorToArray(config.reduce_dim), \ + detail::VectorToArray(config.reduce_strides), \ + detail::VectorToArray(config.left_dim), \ + detail::VectorToArray(config.left_strides)); \ } break switch (config.reduce_type) { @@ -525,17 +558,17 @@ static void launchKernel(const Tx* x_data, Ty* y_data, ReduceType::kReduceHigherDim><<>>( config.output_data, y_data, reducer, detail::IdentityFunctor(), init, config.grid.y, config.left_num, config.grid.y, - detail::from(config.x_strides), - detail::from(config.reduce_dim), - detail::from(config.reduce_strides), - detail::from(config.left_dim), - detail::from(config.left_strides)); + detail::VectorToArray(config.x_strides), + detail::VectorToArray(config.reduce_dim), + detail::VectorToArray(config.reduce_strides), + detail::VectorToArray(config.left_dim), + detail::VectorToArray(config.left_strides)); } } template -static void launchReduceKernel(const Tx* x_data, Ty* y_data, +static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, const platform::Place& place, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, @@ -573,7 +606,7 @@ static void launchReduceKernel(const Tx* x_data, Ty* y_data, return; } - detail::CheckReduceRankIsValid(reduce_rank, rank); + detail::CheckReduceRank(reduce_rank, rank); switch (rank) { CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); @@ -595,6 +628,7 @@ static void launchReduceKernel(const Tx* x_data, Ty* y_data, #undef CUB_REDUCE_RANK_CASE #undef CUB_RANK_CASE } + template void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, const Ty& init, @@ -602,7 +636,7 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); - config.Run(); + config.Run(); // get the parameters of LaunchReduceKernel auto x_data = x.data(); auto y_data = y->mutable_data(x.place()); @@ -623,12 +657,12 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #define CUB_BLOCK_DIM_CASE(block_dim) \ case block_dim: { \ constexpr auto kBlockDim = block_dim; \ - launchReduceKernel( \ + LaunchReduceKernel( \ x_data, y_data, x.place(), reducer, transformer, init, stream, \ config); \ } break - switch (detail::GetDesiredBlockDim(config.reduce_num)) { + switch (detail::GetBlockDim(config.reduce_num)) { CUB_BLOCK_DIM_CASE(512); CUB_BLOCK_DIM_CASE(256); CUB_BLOCK_DIM_CASE(128); From 8cea954ff2cb547ce00e1c30bccbb9f99cdd66f6 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 05:58:12 +0000 Subject: [PATCH 12/34] update --- paddle/fluid/operators/reduce_ops/reduce_functor_op.h | 9 --------- paddle/fluid/operators/reduce_ops/reduce_op.cuh | 6 +++--- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 2aae10337c4d3..15a586d526f74 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,15 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/hostdevice.h" -#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index ed36260c8689f..25fd8ceb8ef48 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -160,9 +160,9 @@ struct ReduceConfig { // when should_reduce_again is true, we need malloc temp space for temp data void SetOutputData(Ty* y_data, const platform::Place& place, - framework::Tensor& tmp) { + framework::Tensor* tmp) { if (should_reduce_again) { - output_data = tmp.mutable_data( + output_data = tmp->mutable_data( framework::make_ddim( {static_cast(left_num * grid.y * sizeof(Ty))}), place); @@ -585,7 +585,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, #define CUB_REDUCE_RANK_CASE(i, ...) \ case i: { \ constexpr auto kReduceRank = i; \ - launchKernel( \ + LaunchKernel( \ x_data, y_data, place, reducer, transformer, init, stream, config); \ } break From a719c3cd9b83dd5328a1263d75b8584a4eb18008 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 06:26:05 +0000 Subject: [PATCH 13/34] update --- paddle/fluid/operators/reduce_ops/reduce_op.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 25fd8ceb8ef48..c21095ea389f2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -645,7 +645,7 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, // SetOutputData for ReduceHigherDim when should_reduce_again is true, // temp_output should be stored temp_data in output_data space or stored in // y_data; - config.SetOutputData(y_data, x.place(), tmp); + config.SetOutputData(y_data, x.place(), &tmp); if (config.reduce_num == 1) { auto out_dims = y->dims(); From 2e8ad8fd78b88272b93bd40ff5681457484a73a3 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 09:31:29 +0000 Subject: [PATCH 14/34] update --- paddle/fluid/operators/reduce_ops/reduce_op.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index c21095ea389f2..5208518865b05 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -79,15 +79,15 @@ static inline std::vector GetDimStrides(const std::vector& dims, } #ifdef __HIPCC__ -constexpr int kMaxBlockDim = 256; +constexpr int kMaxBlock = 256; #else -constexpr int kMaxBlockDim = 512; +constexpr int kMaxBlock = 512; #endif // get blockDim for reduceLastDim and reduceAny static inline int GetBlockDim(int block_dim) { - return block_dim >= kMaxBlockDim - ? kMaxBlockDim + return block_dim >= kMaxBlock + ? 
kMaxBlock
             : (1 << static_cast(std::log2(block_dim)));
 }

From a60b90a688760540c99f73f33d2caf170a54f05d Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Thu, 27 May 2021 02:50:53 +0000
Subject: [PATCH 15/34] fix a bug in reduce_op.cuh

---
 .../operators/reduce_ops/reduce_all_op.cu     |  2 +-
 .../operators/reduce_ops/reduce_any_op.cu     |  2 +-
 .../operators/reduce_ops/reduce_max_op.cu     |  2 +-
 .../operators/reduce_ops/reduce_min_op.cu     |  2 +-
 .../fluid/operators/reduce_ops/reduce_op.cuh  | 64 ++++++++++++-------
 .../operators/reduce_ops/reduce_prod_op.cu    |  2 +-
 6 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
index ac5e35a350a8d..baea69e739c10 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
index 0b3799f216328..57e2c4469cfad 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. Any Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
index cbf8101c3668b..a7524c00ad509 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
index 406398bb70a9b..a81ca77a09bc0 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
index 5208518865b05..0eb402b5e699a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
@@ -86,9 +86,8 @@ constexpr int kMaxBlock = 512;

 // get blockDim for reduceLastDim and reduceAny
 static inline int GetBlockDim(int block_dim) {
-  return block_dim >= kMaxBlock
-             ? kMaxBlock
-             : (1 << static_cast(std::log2(block_dim)));
+  return block_dim >= kMaxBlock ?
kMaxBlock + : (1 << static_cast(std::log2(block_dim))); } // check reduce rand is valid @@ -177,50 +176,71 @@ struct ReduceConfig { // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] void SetReduceDim() { std::set reduce_set; - for (auto e : reduce_dims_origin) { auto pos = e >= 0 ? e : e + x_dim.size(); reduce_set.insert(pos); } + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); - // get reduce_dim + + // update reduce_dim and x_dim + std::vector x_new_dim; + + reduce_dim.push_back(reduce_dim_temp[0]); + x_new_dim.push_back(x_dim[0]); + + int idx_reduce = 1; + int num = 0; + if (reduce_dim_temp.size() > 1) { - int num = 0; // for update axis - reduce_dim.push_back(reduce_dim_temp[0]); - for (int idx = 1; idx < reduce_dim_temp.size(); idx++) { - // update x_dim - if (reduce_dim_temp[idx] - reduce_dim_temp[idx - 1] == 1) { - x_dim[reduce_dim_temp[idx - 1]] *= x_dim[reduce_dim_temp[idx]]; - x_dim.erase(x_dim.begin() + reduce_dim_temp[idx]); - num++; + for (int i = 1; i < x_dim.size(); i++) { + if (idx_reduce < reduce_dim_temp.size() && + i == reduce_dim_temp[idx_reduce]) { + int result = + reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; + bool is_equal = (result - num == 1); + if (is_equal) { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + num++; + } else { + reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); + x_new_dim.push_back(x_dim[i]); + } + idx_reduce++; } else { - reduce_dim.push_back(reduce_dim_temp[idx] - num); + x_new_dim.push_back(x_dim[i]); } } } else { - reduce_dim = reduce_dim_temp; + x_new_dim = x_dim; } - // update new_x_dim and new_reduce_dim - std::vector new_x_dim, new_reduce_dim_temp; + // update x_dim + x_dim = x_new_dim; + std::vector().swap(x_new_dim); + + std::vector reduce_dim_new; int is_reduced = 0; for (auto e : reduce_dim) { + auto pos = e >= 0 ? e : e + x_dim.size(); is_reduced |= 1 << e; } + std::vector().swap(reduce_dim); + for (int i = 0; i < x_dim.size(); i++) { if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { - new_x_dim.push_back(x_dim[i]); + x_new_dim.push_back(x_dim[i]); if ((is_reduced >> i) & 1) - new_reduce_dim_temp.push_back(new_x_dim.size() - 1); + reduce_dim_new.push_back(x_new_dim.size() - 1); } else { - new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; } } - x_dim = new_x_dim; - reduce_dim = new_reduce_dim_temp; + x_dim = x_new_dim; + reduce_dim = reduce_dim_new; int x_rank = static_cast(x_dim.size()); std::set left_set; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 10f2df15a51d0..9f94a1bebf0ea 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
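The SetReduceDim rewrite in PATCH 15 above folds adjacent reduce axes into one another and then fuses runs of same-kind axes, so the kernels only ever see a minimal-rank problem. Below is a minimal host-side sketch of that merging behaviour; it is a simplified re-implementation for illustration only, not the literal Paddle code, and the function name MergeReduceDims and the example shapes are invented for this note.

#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Fuse adjacent axes of the same kind (reduce / non-reduce), mirroring the
// net effect of the patched SetReduceDim.
static void MergeReduceDims(std::vector<int64_t>* x_dim,
                            std::vector<int>* reduce_dim) {
  std::set<int> reduce_set(reduce_dim->begin(), reduce_dim->end());
  std::vector<int64_t> new_x;
  std::vector<int> new_reduce;
  for (int i = 0; i < static_cast<int>(x_dim->size()); ++i) {
    bool is_reduce = reduce_set.count(i) > 0;
    bool same_kind_as_prev =
        i > 0 && (reduce_set.count(i - 1) > 0) == is_reduce;
    if (same_kind_as_prev) {
      new_x.back() *= (*x_dim)[i];  // fold into the previous axis
    } else {
      new_x.push_back((*x_dim)[i]);
      if (is_reduce) new_reduce.push_back(static_cast<int>(new_x.size()) - 1);
    }
  }
  *x_dim = new_x;
  *reduce_dim = new_reduce;
}

int main() {
  std::vector<int64_t> x_dim = {8, 4, 4, 16};
  std::vector<int> reduce_dim = {1, 2};
  MergeReduceDims(&x_dim, &reduce_dim);
  for (auto d : x_dim) std::cout << d << ' ';               // prints: 8 16 16
  std::cout << "| reduce axis: " << reduce_dim[0] << '\n';  // reduce axis: 1
  return 0;
}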
From 4bd964486133ff0c1d7f3567f1e56c322165e8a4 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Fri, 28 May 2021 11:26:17 +0000 Subject: [PATCH 16/34] reset reduce_any and reduce_all --- .../operators/reduce_ops/reduce_all_op.cu | 43 ++---------------- .../operators/reduce_ops/reduce_any_op.cu | 44 ++----------------- 2 files changed, 6 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index baea69e739c10..89f3345fcbe42 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -13,44 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class BoolReduceAllKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - auto stream = context.cuda_device_context().stream(); - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(true), - CustomLogicalAnd(), detail::IdentityFunctor(), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_all, ops::BoolReduceAllKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_all, ops::BoolReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 57e2c4469cfad..c0f94098a351e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -13,45 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class BoolReduceAnyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size());
-      }
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduceFunc, detail::IdentityFunctor>(
-        *input, output, reduce_dims, static_cast(false),
-        CustomLogicalOr(), detail::IdentityFunctor(), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_any, ops::BoolReduceAnyKernel);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_any, ops::BoolReduceKernel);

From bf701a226fc9111d4f018fadbe5492d96f61fed3 Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Mon, 31 May 2021 02:25:06 +0000
Subject: [PATCH 17/34] delete __forceinline__ in reduce_functor_op.h

---
 .../operators/reduce_ops/reduce_functor_op.h | 20 ++++------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
index 15a586d526f74..069a521245479 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -19,44 +19,36 @@ namespace operators {

 template
 struct CustomMin {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  __device__ T operator()(const T &a, const T &b) const {
     return (b < a) ? b : a;
   }
 };

 template
 struct CustomMax {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  __device__ T operator()(const T &a, const T &b) const {
     return (b > a) ? b : a;
   }
 };

 template
 struct CustomSum {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b + a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b + a; }
 };

 template
 struct CustomMul {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b * a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b * a; }
 };

 template
 struct CustomLogicalOr {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b || a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b || a; }
 };

 template
 struct CustomLogicalAnd {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b && a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b && a; }
 };

 }  // namespace operators

From 6174b5021d6e97667a7376db5e06f2558805db65 Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Mon, 31 May 2021 03:53:58 +0000
Subject: [PATCH 18/34] from DEVICE to HOSTDEVICE

---
 paddle/fluid/operators/reduce_ops/reduce_op.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
index 0eb402b5e699a..43666ccc8a940 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
@@ -40,17 +40,17 @@ namespace detail {

 // Post processing function for sum, max, min, prod, any
 template
 struct IdentityFunctor {
-  DEVICE explicit inline IdentityFunctor() {}
+  HOSTDEVICE explicit inline IdentityFunctor() {}

-  DEVICE inline T operator()(const T& x) const { return x; }
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
 };

 // Post processing function for mean
 template
 struct DivideFunctor {
-  DEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}

-  DEVICE inline T operator()(const T& x) const { return x * n_inv; }
+  HOSTDEVICE inline T operator()(const T& x) const { return
x * n_inv; } private: T n_inv; From 59c32d6f3c484c88becf735d5f29a0d8ac736104 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 1 Jun 2021 11:06:14 +0000 Subject: [PATCH 19/34] add DataBound struct for reduce_max and reduce_min --- .../operators/reduce_ops/reduce_functor_op.h | 35 +++++++++++++++++++ .../operators/reduce_ops/reduce_max_op.cu | 2 +- .../operators/reduce_ops/reduce_min_op.cu | 2 +- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 069a521245479..718ae6715e502 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,10 +13,45 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include +#include +#include namespace paddle { namespace operators { +template +struct DataBound { + static inline T max() { return static_cast(FLT_MAX); } + static inline T min() { return static_cast(-FLT_MAX); } +}; + +template <> +struct DataBound { + static inline float max() { return FLT_MAX; } + static inline float min() { return -FLT_MAX; } +}; + +template <> +struct DataBound { + static inline double max() { return DBL_MAX; } + static inline double min() { return -DBL_MAX; } +}; + +template <> +struct DataBound { + static inline int32_t max() { return INT32_MAX; } + static inline int32_t min() { return INT32_MIN; } +}; + +template <> +struct DataBound { + static inline int64_t max() { return INT64_MAX; } + static inline int64_t min() { return INT64_MIN; } +}; + template struct CustomMin { __device__ T operator()(const T &a, const T &b) const { diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index a7524c00ad509..86a374bc788dc 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -44,7 +44,7 @@ class ReduceMaxKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), + *input, output, reduce_dims, DataBound::min(), CustomMax(), detail::IdentityFunctor(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index a81ca77a09bc0..296eff7125bd4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -44,7 +44,7 @@ class ReduceMinKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), + *input, output, reduce_dims, DataBound::max(), CustomMin(), detail::IdentityFunctor(), stream); } }; From 790173af23957990286edf1bc8d198684af3ced3 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 1 Jun 2021 20:55:18 +0800 Subject: [PATCH 20/34] Update reduce_functor_op.h --- paddle/fluid/operators/reduce_ops/reduce_functor_op.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 718ae6715e502..f80a2842f4322 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ 
b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once -#include #include #include #include From 87008944b55daedc4af11127983fa6edb17b276c Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 2 Jun 2021 11:58:25 +0000 Subject: [PATCH 21/34] update TensorReduceFunc --- .../operators/reduce_ops/reduce_max_op.cu | 2 +- .../operators/reduce_ops/reduce_min_op.cu | 2 +- .../fluid/operators/reduce_ops/reduce_op.cuh | 93 ++++++++++++------- .../operators/reduce_ops/reduce_prod_op.cu | 19 ++-- 4 files changed, 70 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 86a374bc788dc..db65eeeb27394 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -45,7 +45,7 @@ class ReduceMaxKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, DataBound::min(), CustomMax(), - detail::IdentityFunctor(), stream); + detail::IdentityFunctor(), detail::IdentityFunctor(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 296eff7125bd4..b5f5a021b379b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -45,7 +45,7 @@ class ReduceMinKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, DataBound::max(), CustomMin(), - detail::IdentityFunctor(), stream); + detail::IdentityFunctor(), detail::IdentityFunctor(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 43666ccc8a940..8906264dedd45 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -38,11 +38,13 @@ namespace operators { namespace detail { // Post processing function for sum, max, min, prod, any -template +template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } }; // Post processing function for mean @@ -81,7 +83,7 @@ static inline std::vector GetDimStrides(const std::vector& dims, #ifdef __HIPCC__ constexpr int kMaxBlock = 256; #else -constexpr int kMaxBlock = 512; +constexpr int kMaxBlock = 128; #endif // get blockDim for reduceLastDim and reduceAny @@ -544,8 +546,7 @@ __global__ void ReduceKernelFunction( template -static void LaunchKernel(const Tx* x_data, Ty* y_data, - const platform::Place& place, const ReduceOp& reducer, +static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, gpuStream_t stream, ReduceConfig config) { #define CUB_REDUCE_TYPE_CASE(type) \ @@ -589,7 +590,6 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, - const platform::Place& place, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, gpuStream_t stream, 
ReduceConfig config) { @@ -606,26 +606,9 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, case i: { \ constexpr auto kReduceRank = i; \ LaunchKernel( \ - x_data, y_data, place, reducer, transformer, init, stream, config); \ + x_data, y_data, reducer, transformer, init, stream, config); \ } break - // launch CUB::Reduce - if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { - cub::TransformInputIterator trans_x( - x_data, transformer); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); - framework::Tensor tmp; - auto* temp_storage = tmp.mutable_data( - framework::make_ddim({static_cast(temp_storage_bytes)}), - place); - cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); - - return; - } - detail::CheckReduceRank(reduce_rank, rank); switch (rank) { CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); @@ -649,10 +632,12 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, #undef CUB_RANK_CASE } -template +template void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, const Ty& init, const ReduceOp& reducer, const TransformOp& transformer, + const CubTransformOp& cub_transformer, gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); @@ -673,13 +658,28 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, y->Resize(out_dims); return; } + // launch CUB::Reduce + if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + cub::TransformInputIterator trans_x( + x_data, cub_transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + x.place()); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + + return; + } -#define CUB_BLOCK_DIM_CASE(block_dim) \ - case block_dim: { \ - constexpr auto kBlockDim = block_dim; \ - LaunchReduceKernel( \ - x_data, y_data, x.place(), reducer, transformer, init, stream, \ - config); \ +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + LaunchReduceKernel( \ + x_data, y_data, reducer, transformer, init, stream, config); \ } break switch (detail::GetBlockDim(config.reduce_num)) { @@ -696,5 +696,36 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } +template class TransformOp> +struct TensorReduceFunctorImpl { + const framework::Tensor& x; + framework::Tensor* y; + std::vector origin_reduce_dims; + const double& init; + const ReduceOp& reducer; + gpuStream_t stream; + TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, + const double& init, const ReduceOp& reducer, + gpuStream_t stream) + : x(x), + y(y), + origin_reduce_dims(origin_reduce_dims), + init(init), + reducer(reducer), + stream(stream) {} + + template + + void apply() const { + const Ty& init_cast = static_cast(init); + TensorReduceFunc, + TransformOp>(x, y, origin_reduce_dims, init_cast, + reducer, TransformOp(), + TransformOp(), stream); + } +}; + } // namespace operators } // namespace paddle diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
index 9f94a1bebf0ea..0d309e278a387 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
@@ -51,22 +51,15 @@ class ReduceProdKernel : public framework::OpKernel {
     auto stream = context.cuda_device_context().stream();
     if (out_dtype >= 0) {
-#define VisitDataTypeSmall_t(cpp_type, proto_type)            \
-  do {                                                        \
-    if (static_cast(out_dtype) ==                             \
-        proto_type) {                                         \
-      TensorReduceFunc,                                       \
-                       detail::IdentityFunctor>(              \
-          *input, output, reduce_dims, static_cast(1.0f),     \
-          CustomMul(), detail::IdentityFunctor(), stream);    \
-    }                                                         \
-  } while (0)
-      _ForEachDataTypeSmall_(VisitDataTypeSmall_t);
-#undef VisitDataTypeSmall_t
+      framework::VisitDataTypeSmall(
+          static_cast(out_dtype),
+          TensorReduceFunctorImpl(
+              *input, output, reduce_dims, static_cast(1.0f),
+              cub::Sum(), stream));
     } else {
       TensorReduceFunc, detail::IdentityFunctor>(
           *input, output, reduce_dims, static_cast(1.0f), CustomMul(),
-          detail::IdentityFunctor(), stream);
+          detail::IdentityFunctor(), detail::IdentityFunctor(), stream);
     }
   }
 };

From 9e32b0f3071c3bf743c3f9563aa909acaf64a0aa Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Thu, 3 Jun 2021 02:39:39 +0000
Subject: [PATCH 22/34] add reduce_functor_op.h pragma once

---
 paddle/fluid/operators/reduce_ops/reduce_functor_op.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
index f80a2842f4322..31f800c05f6b2 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma once
+
 #include
 #include
 #include

From 17dcaf8e04ddffdf2df618b2c04543aed7757250 Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Mon, 7 Jun 2021 00:53:54 +0000
Subject: [PATCH 23/34] update BOUND and kMaxThread

---
 .../operators/reduce_ops/reduce_functor_op.h | 20 +++++++++----
 .../fluid/operators/reduce_ops/reduce_op.cuh | 30 +++++++++----------
 .../operators/reduce_ops/reduce_prod_op.cu   |  5 ++--
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
index 31f800c05f6b2..1fcfe2bd9672a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -54,36 +54,44 @@ struct DataBound {

 template
 struct CustomMin {
-  __device__ T operator()(const T &a, const T &b) const {
+  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
     return (b < a) ? b : a;
   }
 };

 template
 struct CustomMax {
-  __device__ T operator()(const T &a, const T &b) const {
+  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
     return (b > a) ?
b : a; } }; template struct CustomSum { - __device__ T operator()(const T &a, const T &b) const { return b + a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b + a; + } }; template struct CustomMul { - __device__ T operator()(const T &a, const T &b) const { return b * a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b * a; + } }; template struct CustomLogicalOr { - __device__ T operator()(const T &a, const T &b) const { return b || a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b || a; + } }; template struct CustomLogicalAnd { - __device__ T operator()(const T &a, const T &b) const { return b && a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b && a; + } }; } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 8906264dedd45..c6b88a636b646 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -32,6 +32,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#define BOUND 512 namespace paddle { namespace operators { @@ -81,15 +82,16 @@ static inline std::vector GetDimStrides(const std::vector& dims, } #ifdef __HIPCC__ -constexpr int kMaxBlock = 256; +constexpr int kMaxThread = 256; #else -constexpr int kMaxBlock = 128; +constexpr int kMaxThread = 128; #endif // get blockDim for reduceLastDim and reduceAny static inline int GetBlockDim(int block_dim) { - return block_dim >= kMaxBlock ? kMaxBlock - : (1 << static_cast(std::log2(block_dim))); + return block_dim >= kMaxThread + ? 
kMaxThread + : (1 << static_cast(std::log2(block_dim))); } // check reduce rand is valid @@ -287,13 +289,15 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - + bool is_large_enough = reduce_num > BOUND / 2 && left_num > BOUND; if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { reduce_type = static_cast(ReduceType::kReduceLastDim); - } else if (reduce_rank == 1) { + + } else if (reduce_rank == 1 && + (rank == 2 && is_large_enough || rank != 2)) { // ReduceFirstDim and reduceSecondDim reduce_type = static_cast(ReduceType::kReduceHigherDim); @@ -333,7 +337,7 @@ struct ReduceConfig { // init int num_block = (max_threads / left_num); - if (num_block > 1 && reduce_num >= 512) { + if (num_block > 1 && reduce_num >= BOUND) { blocking_size = detail::GetLastPow2(reduce_num / num_block); if (blocking_size <= 1) { @@ -683,7 +687,6 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, } break switch (detail::GetBlockDim(config.reduce_num)) { - CUB_BLOCK_DIM_CASE(512); CUB_BLOCK_DIM_CASE(256); CUB_BLOCK_DIM_CASE(128); CUB_BLOCK_DIM_CASE(64); @@ -696,33 +699,30 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } -template class ReduceOp, template class TransformOp> struct TensorReduceFunctorImpl { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; const double& init; - const ReduceOp& reducer; gpuStream_t stream; TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, - const double& init, const ReduceOp& reducer, - gpuStream_t stream) + const double& init, gpuStream_t stream) : x(x), y(y), origin_reduce_dims(origin_reduce_dims), init(init), - reducer(reducer), stream(stream) {} template void apply() const { const Ty& init_cast = static_cast(init); - TensorReduceFunc, + TensorReduceFunc, TransformOp, TransformOp>(x, y, origin_reduce_dims, init_cast, - reducer, TransformOp(), + ReduceOp(), TransformOp(), TransformOp(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 0d309e278a387..6ae62ba2c49ef 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -53,9 +53,8 @@ class ReduceProdKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctorImpl( - *input, output, reduce_dims, static_cast(1.0f), - cub::Sum(), stream)); + TensorReduceFunctorImpl( + *input, output, reduce_dims, static_cast(1.0f), stream)); } else { TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(1.0f), CustomMul(), From cb2b619e133c293366ace126703af00a36701af0 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 9 Jun 2021 03:16:17 +0000 Subject: [PATCH 24/34] modified max min prod for cu.h --- .../operators/reduce_ops/reduce_functor_op.h | 112 ++++++++----- .../operators/reduce_ops/reduce_max_op.cu | 21 +-- .../operators/reduce_ops/reduce_min_op.cu | 21 +-- .../{reduce_op.cuh => reduce_op.cu.h} | 154 ++++++++---------- .../operators/reduce_ops/reduce_prod_op.cu | 31 +--- 5 files changed, 158 insertions(+), 181 deletions(-) rename paddle/fluid/operators/reduce_ops/{reduce_op.cuh => reduce_op.cu.h} (85%) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h 
b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 1fcfe2bd9672a..944097ef0e1ab 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,83 +13,117 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include -#include #include -#include +#include + +#ifdef __HIPCC__ +#include +#endif namespace paddle { namespace operators { -template -struct DataBound { - static inline T max() { return static_cast(FLT_MAX); } - static inline T min() { return static_cast(-FLT_MAX); } -}; +// Post processing function for sum, max, min, prod, any +template +struct IdentityFunctor { + __device__ explicit inline IdentityFunctor() {} -template <> -struct DataBound { - static inline float max() { return FLT_MAX; } - static inline float min() { return -FLT_MAX; } -}; + __device__ explicit inline IdentityFunctor(int n) {} -template <> -struct DataBound { - static inline double max() { return DBL_MAX; } - static inline double min() { return -DBL_MAX; } + __device__ inline Ty operator()(const Tx &x) const { + return static_cast(x); + } }; -template <> -struct DataBound { - static inline int32_t max() { return INT32_MAX; } - static inline int32_t min() { return INT32_MIN; } -}; +// Post processing function for mean +template +struct DivideFunctor { + __device__ explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + __device__ inline T operator()(const T &x) const { return x * n_inv; } -template <> -struct DataBound { - static inline int64_t max() { return INT64_MAX; } - static inline int64_t min() { return INT64_MIN; } + private: + T n_inv; }; -template +template struct CustomMin { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { + return std::numeric_limits::max(); + } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return (b < a) ? b : a; } }; -template +template struct CustomMax { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { + return std::numeric_limits::min(); + } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return (b > a) ? 
b : a; } }; -template +// for cub::Reduce +template struct CustomSum { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; } }; -template +template +struct CustomMean { + using Transformer = DivideFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b + a; + } +}; + +template struct CustomMul { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(1.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b * a; } }; -template +template struct CustomLogicalOr { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(false); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b || a; } }; -template +template struct CustomLogicalAnd { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(true); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b && a; } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index db65eeeb27394..d8b4f13a7442c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" namespace paddle { @@ -26,26 +26,13 @@ class ReduceMaxKernel : public framework::OpKernel { bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); auto stream = context.cuda_device_context().stream(); - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, DataBound::min(), CustomMax(), - detail::IdentityFunctor(), detail::IdentityFunctor(), stream); + TensorReduceFunc(*input, output, reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index b5f5a021b379b..b81dba15b5da4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -13,7 +13,7 @@ // limitations under the License. 
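// (Illustrative sketch, not part of this patch. The template parameter lists
// in this transcript were lost in extraction, so the Tx/Ty spellings below
// are assumptions.) The contract these kernels rely on after PATCH 24 is
// that a reduce op bundles its own element transform and identity element:
//
//   template <typename Tx, typename Ty = Tx>
//   struct SomeReduceOp {
//     using Transformer = IdentityFunctor<Tx, Ty>;  // per-element pre-process
//     __device__ __forceinline__ Ty initial();      // identity element
//     __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const;
//   };
//
// One caveat worth flagging: for floating-point types,
// std::numeric_limits<T>::min() (used by CustomMax::initial() above) is the
// smallest positive value, while std::numeric_limits<T>::lowest() is the most
// negative one and is normally the identity a max-reduction wants.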
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" namespace paddle { @@ -26,26 +26,13 @@ class ReduceMinKernel : public framework::OpKernel { bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); auto stream = context.cuda_device_context().stream(); - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, DataBound::max(), CustomMin(), - detail::IdentityFunctor(), detail::IdentityFunctor(), stream); + TensorReduceFunc(*input, output, reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h similarity index 85% rename from paddle/fluid/operators/reduce_ops/reduce_op.cuh rename to paddle/fluid/operators/reduce_ops/reduce_op.cu.h index c6b88a636b646..4e6b0d19ef8a4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -32,32 +32,29 @@ namespace cub = hipcub; #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#define BOUND 512 +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" + +#define BOUNDARY 512 // Reduce split or not, Whether to use ReduceHigherDim namespace paddle { namespace operators { namespace detail { -// Post processing function for sum, max, min, prod, any -template -struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor() {} - - HOSTDEVICE inline Ty operator()(const Tx& x) const { - return static_cast(x); +static inline std::vector GetReduceDim(const std::vector& dims, + int dim_size, bool reduce_all) { + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(dim_size); + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? 
e : e + dim_size); + } } -}; - -// Post processing function for mean -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; + return reduce_dims; +} static inline int GetLastPow2(int n) { n |= (n >> 1); @@ -289,7 +286,7 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_large_enough = reduce_num > BOUND / 2 && left_num > BOUND; + bool is_large_enough = (reduce_num > BOUNDARY / 2) || (left_num > BOUNDARY); if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); @@ -337,7 +334,7 @@ struct ReduceConfig { // init int num_block = (max_threads / left_num); - if (num_block > 1 && reduce_num >= BOUND) { + if (num_block > 1 && reduce_num >= BOUNDARY) { blocking_size = detail::GetLastPow2(reduce_num / num_block); if (blocking_size <= 1) { @@ -394,14 +391,15 @@ template __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, int reduce_num) { __shared__ typename cub::BlockReduce::TempStorage temp_storage; int idx_x = blockIdx.x * reduce_num; int idx_y = threadIdx.x; - Ty reduce_var = init; + Ty reduce_var = reducer.initial(); for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) { - reduce_var = reducer(reduce_var, static_cast(x[idx_x + idx_y])); + reduce_var = + reducer(reduce_var, static_cast(transformer(x[idx_x + idx_y]))); } __syncthreads(); @@ -409,7 +407,7 @@ __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); if (threadIdx.x == 0) { - y[blockIdx.x] = transformer(reduce_var); + y[blockIdx.x] = reduce_var; } } @@ -422,13 +420,12 @@ template __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, - Ty init, int reduce_num, - int left_num, int block_size) { + int reduce_num, int left_num, + int block_size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int idy = blockIdx.y * block_size; - Ty temp = init; - Ty reduce_var = init; + Ty reduce_var = reducer.initial(); if (idx < left_num) { int loop = reduce_num - idy; @@ -436,11 +433,11 @@ __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, for (int iy = 0; iy < loop; iy++) { int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; - reduce_var = reducer(reduce_var, static_cast(x[id])); + reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); } y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = - static_cast(transformer(reduce_var)); + reduce_var; } } @@ -450,7 +447,7 @@ __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, template __device__ __forceinline__ void ReduceAny( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, int reduce_num, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, paddle::framework::Array reduce_strides, @@ -490,8 +487,8 @@ __device__ __forceinline__ void ReduceAny( idx_x += (sub_index[k] * x_strides[k]); } - reduce_var = - static_cast(reducer(reduce_var, static_cast(x[idx_x]))); + reduce_var = static_cast( + reducer(reduce_var, static_cast(transformer(x[idx_x])))); } __syncthreads(); @@ -499,7 +496,7 @@ __device__ __forceinline__ void ReduceAny( 
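// At this point every thread in the block holds a partial result in
// reduce_var; the CUB block-wide reduction below combines those BlockDim
// partials, and only thread 0 writes the final value for this block.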
cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); if (threadIdx.x == 0) { - y[blockIdx.x] = transformer(reduce_var); + y[blockIdx.x] = reduce_var; } } @@ -507,7 +504,7 @@ __device__ __forceinline__ void ReduceAny( template __device__ __forceinline__ void ReduceModule( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, int reduce_num, int left_num, int blocking_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -517,17 +514,17 @@ __device__ __forceinline__ void ReduceModule( // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1 if (ReduceType == ReduceType::kReduceLastDim) { ReduceLastDim( - x, y, reducer, transformer, init, reduce_num); + x, y, reducer, transformer, reduce_num); // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (ReduceType == ReduceType::kReduceHigherDim) { ReduceHigherDim( - x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); + x, y, reducer, transformer, reduce_num, left_num, blocking_size); // reduce_rank >= 2 } else { ReduceAny( - x, y, reducer, transformer, init, reduce_num, x_strides, reduce_dim, + x, y, reducer, transformer, reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, left_strides); } } @@ -535,7 +532,7 @@ __device__ __forceinline__ void ReduceModule( template __global__ void ReduceKernelFunction( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, int reduce_num, int left_num, int block_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -543,24 +540,24 @@ __global__ void ReduceKernelFunction( paddle::framework::Array left_dim, paddle::framework::Array left_strides) { ReduceModule(x, y, reducer, transformer, init, reduce_num, - left_num, block_size, x_strides, reduce_dim, - reduce_strides, left_dim, left_strides); + ReduceType>(x, y, reducer, transformer, reduce_num, left_num, + block_size, x_strides, reduce_dim, reduce_strides, + left_dim, left_strides); } template static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, - const TransformOp& transformer, const Ty& init, - gpuStream_t stream, ReduceConfig config) { + const TransformOp& transformer, gpuStream_t stream, + ReduceConfig config) { #define CUB_REDUCE_TYPE_CASE(type) \ case type: { \ constexpr auto kReduceType = type; \ ReduceKernelFunction< \ Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ kReduceType><<>>( \ - x_data, config.output_data, reducer, transformer, init, \ - config.reduce_num, config.left_num, config.blocking_size, \ + x_data, config.output_data, reducer, transformer, config.reduce_num, \ + config.left_num, config.blocking_size, \ detail::VectorToArray(config.x_strides), \ detail::VectorToArray(config.reduce_dim), \ detail::VectorToArray(config.reduce_strides), \ @@ -579,10 +576,10 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, dim3 grid(config.grid.x, 1, config.grid.z); ReduceKernelFunction< - Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, + Ty, Ty, ReduceOp, IdentityFunctor, 128, kRank, kReduceRank, ReduceType::kReduceHigherDim><<>>( - config.output_data, y_data, reducer, detail::IdentityFunctor(), - init, config.grid.y, config.left_num, config.grid.y, + config.output_data, y_data, reducer, IdentityFunctor(), + config.grid.y, config.left_num, config.grid.y, detail::VectorToArray(config.x_strides), 
detail::VectorToArray(config.reduce_dim), detail::VectorToArray(config.reduce_strides), @@ -595,7 +592,7 @@ template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, - const TransformOp& transformer, const Ty& init, + const TransformOp& transformer, gpuStream_t stream, ReduceConfig config) { int reduce_rank = config.reduce_strides.size(); int rank = config.x_strides.size(); @@ -610,7 +607,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, case i: { \ constexpr auto kReduceRank = i; \ LaunchKernel( \ - x_data, y_data, reducer, transformer, init, stream, config); \ + x_data, y_data, reducer, transformer, stream, config); \ } break detail::CheckReduceRank(reduce_rank, rank); @@ -636,13 +633,10 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, #undef CUB_RANK_CASE } -template +template class ReduceOp> void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, const Ty& init, - const ReduceOp& reducer, const TransformOp& transformer, - const CubTransformOp& cub_transformer, - gpuStream_t stream) { + std::vector origin_reduce_dims, gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(); // get the parameters of LaunchReduceKernel @@ -662,28 +656,33 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, y->Resize(out_dims); return; } + using TransformOp = typename ReduceOp::Transformer; + auto reducer = ReduceOp(); // launch CUB::Reduce if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { - cub::TransformInputIterator trans_x( - x_data, cub_transformer); + cub::TransformInputIterator trans_x( + x_data, TransformOp(config.reduce_num)); size_t temp_storage_bytes = 0; cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); + config.reduce_num, reducer, reducer.initial(), + stream); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), x.place()); cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); + config.reduce_num, reducer, reducer.initial(), + stream); return; } -#define CUB_BLOCK_DIM_CASE(block_dim) \ - case block_dim: { \ - constexpr auto kBlockDim = block_dim; \ - LaunchReduceKernel( \ - x_data, y_data, reducer, transformer, init, stream, config); \ +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + LaunchReduceKernel, TransformOp>( \ + x_data, y_data, reducer, TransformOp(config.reduce_num), stream, \ + config); \ } break switch (detail::GetBlockDim(config.reduce_num)) { @@ -699,31 +698,20 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } -template class ReduceOp, - template class TransformOp> +template class ReduceOp> struct TensorReduceFunctorImpl { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; - const double& init; gpuStream_t stream; TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, - const double& init, gpuStream_t stream) - : x(x), - y(y), - origin_reduce_dims(origin_reduce_dims), - init(init), - stream(stream) {} + gpuStream_t stream) + : x(x), y(y), origin_reduce_dims(origin_reduce_dims), stream(stream) {} template - void apply() const { - const Ty& init_cast = 
static_cast(init); - TensorReduceFunc, TransformOp, - TransformOp>(x, y, origin_reduce_dims, init_cast, - ReduceOp(), TransformOp(), - TransformOp(), stream); + TensorReduceFunc(x, y, origin_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 6ae62ba2c49ef..1e996dcaea5be 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -13,8 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" namespace paddle { @@ -28,37 +27,19 @@ class ReduceProdKernel : public framework::OpKernel { auto* input = context.Input("X"); auto* output = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); auto stream = context.cuda_device_context().stream(); if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctorImpl( - *input, output, reduce_dims, static_cast(1.0f), stream)); + TensorReduceFunctorImpl(*input, output, reduce_dims, + stream)); } else { - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(1.0f), CustomMul(), - detail::IdentityFunctor(), detail::IdentityFunctor(), stream); + TensorReduceFunc(*input, output, reduce_dims, stream); } } }; From 6541ffbf330635cdae0b9cf0dbd0278f3984bd55 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 9 Jun 2021 16:33:42 +0000 Subject: [PATCH 25/34] update for struct --- .../operators/reduce_ops/reduce_functor_op.h | 38 ++++--------------- .../fluid/operators/reduce_ops/reduce_op.cu.h | 28 ++++++++++++-- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 944097ef0e1ab..8fb221e86f2bc 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #ifdef __HIPCC__ #include #endif @@ -23,32 +24,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -// Post processing function for sum, max, min, prod, any -template -struct IdentityFunctor { - __device__ explicit inline IdentityFunctor() {} - - __device__ explicit inline IdentityFunctor(int n) {} - - __device__ inline Ty operator()(const Tx &x) const { - return static_cast(x); - } -}; - -// Post processing function for mean -template -struct DivideFunctor { - __device__ explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - __device__ inline T operator()(const T &x) const { return x * n_inv; } - - private: - T n_inv; -}; - template struct CustomMin { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return std::numeric_limits::max(); @@ -61,7 +39,7 @@ struct CustomMin { template struct CustomMax { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return std::numeric_limits::min(); @@ -75,7 +53,7 @@ struct CustomMax { // for cub::Reduce template struct CustomSum { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } @@ -86,7 +64,7 @@ struct CustomSum { template struct CustomMean { - using Transformer = DivideFunctor; + using Transformer = detail::DivideFunctor; __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } @@ -97,7 +75,7 @@ struct CustomMean { template struct CustomMul { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(1.0f); } @@ -108,7 +86,7 @@ struct CustomMul { template struct CustomLogicalOr { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(false); } @@ -119,7 +97,7 @@ struct CustomLogicalOr { template struct CustomLogicalAnd { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(true); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 4e6b0d19ef8a4..31f7820dc12d2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -32,7 +32,6 @@ namespace cub = hipcub; #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #define BOUNDARY 512 // Reduce split or not, Whether to use ReduceHigherDim @@ -40,6 +39,29 @@ namespace paddle { namespace operators { namespace detail { +// Post processing function for sum, max, min, prod, any +template +struct IdentityFunctor { + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx &x) const { + return static_cast(x); + } +}; + +// Post processing function for mean +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T &x) const { return x * n_inv; } + + private: + T n_inv; +}; + + static inline std::vector GetReduceDim(const std::vector& dims, int dim_size, bool reduce_all) { std::vector reduce_dims; @@ -576,9 +598,9 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, dim3 
grid(config.grid.x, 1, config.grid.z); ReduceKernelFunction< - Ty, Ty, ReduceOp, IdentityFunctor, 128, kRank, kReduceRank, + Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, ReduceType::kReduceHigherDim><<>>( - config.output_data, y_data, reducer, IdentityFunctor(), + config.output_data, y_data, reducer, detail::IdentityFunctor(config.grid.y), config.grid.y, config.left_num, config.grid.y, detail::VectorToArray(config.x_strides), detail::VectorToArray(config.reduce_dim), From 719e435fc22372ca04bec30c5ba791638d232b39 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 9 Jun 2021 16:39:31 +0000 Subject: [PATCH 26/34] code style reduce_op.cu.h --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 31f7820dc12d2..26925ac2c4e80 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -42,10 +42,9 @@ namespace detail { // Post processing function for sum, max, min, prod, any template struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor(int n) {} - HOSTDEVICE inline Ty operator()(const Tx &x) const { + HOSTDEVICE inline Ty operator()(const Tx& x) const { return static_cast(x); } }; @@ -55,13 +54,12 @@ template struct DivideFunctor { HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - HOSTDEVICE inline T operator()(const T &x) const { return x * n_inv; } + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } private: T n_inv; }; - static inline std::vector GetReduceDim(const std::vector& dims, int dim_size, bool reduce_all) { std::vector reduce_dims; @@ -600,8 +598,9 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, ReduceKernelFunction< Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, ReduceType::kReduceHigherDim><<>>( - config.output_data, y_data, reducer, detail::IdentityFunctor(config.grid.y), - config.grid.y, config.left_num, config.grid.y, + config.output_data, y_data, reducer, + detail::IdentityFunctor(config.grid.y), config.grid.y, + config.left_num, config.grid.y, detail::VectorToArray(config.x_strides), detail::VectorToArray(config.reduce_dim), detail::VectorToArray(config.reduce_strides), From 5045a4919ad3fd1d4f3b2e1e84287297f2e18e1b Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Thu, 10 Jun 2021 06:18:17 +0000 Subject: [PATCH 27/34] device to HOSTDEVICE --- .../fluid/operators/reduce_ops/reduce_functor_op.h | 14 +++++++------- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 944097ef0e1ab..0f06dc93db43a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -50,7 +50,7 @@ template struct CustomMin { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { + HOSTDEVICE __forceinline__ Ty initial() { return std::numeric_limits::max(); } @@ -63,7 +63,7 @@ template struct CustomMax { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { + HOSTDEVICE __forceinline__ Ty initial() { return std::numeric_limits::min(); } @@ -77,7 +77,7 @@ template struct CustomSum { using Transformer = IdentityFunctor; - __device__ 
__forceinline__ Ty initial() { return static_cast(0.0f); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -88,7 +88,7 @@ template struct CustomMean { using Transformer = DivideFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -99,7 +99,7 @@ template struct CustomMul { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(1.0f); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(1.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b * a; @@ -110,7 +110,7 @@ template struct CustomLogicalOr { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(false); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(false); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b || a; @@ -121,7 +121,7 @@ template struct CustomLogicalAnd { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(true); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(true); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b && a; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 4e6b0d19ef8a4..9e0a4e00d9d7f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -472,7 +472,7 @@ __device__ __forceinline__ void ReduceAny( for (int k = 0; k < Rank; ++k) { idx_x += (sub_index[k] * x_strides[k]); } - Ty reduce_var = static_cast(x[idx_x]); + Ty reduce_var = static_cast(transformer(x[idx_x])); for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { int reduce_idx = i; From fb69e3d41b8e89d50c9366ccd0fa99457afc6641 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 15 Jun 2021 07:01:13 +0000 Subject: [PATCH 28/34] ReduceCudaKernel --- .../operators/reduce_ops/reduce_functor_op.h | 2 +- .../operators/reduce_ops/reduce_max_op.cu | 33 +++---------- .../operators/reduce_ops/reduce_min_op.cu | 33 +++---------- .../fluid/operators/reduce_ops/reduce_op.cu.h | 28 +++++++++++ .../operators/reduce_ops/reduce_prod_op.cu | 47 ++++--------------- 5 files changed, 51 insertions(+), 92 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 9b10286239bb7..9cd3e1197af24 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -42,7 +42,7 @@ struct CustomMax { using Transformer = detail::IdentityFunctor; HOSTDEVICE __forceinline__ Ty initial() { - return std::numeric_limits::min(); + return std::numeric_limits::lowest(); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index d8b4f13a7442c..f214fcba199a3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -11,34 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class ReduceMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - - std::vector reduce_dims = - detail::GetReduceDim(dims, input->dims().size(), reduce_all); - - auto stream = context.cuda_device_context().stream(); - TensorReduceFunc(*input, output, reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_max, ops::ReduceMaxKernel, - ops::ReduceMaxKernel, ops::ReduceMaxKernel, - ops::ReduceMaxKernel); +// reduce_max +REGISTER_OP_CUDA_KERNEL( + reduce_max, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index b81dba15b5da4..7806df284d8c0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -11,34 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class ReduceMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - - std::vector reduce_dims = - detail::GetReduceDim(dims, input->dims().size(), reduce_all); - - auto stream = context.cuda_device_context().stream(); - TensorReduceFunc(*input, output, reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_min, ops::ReduceMinKernel, - ops::ReduceMinKernel, ops::ReduceMinKernel, - ops::ReduceMinKernel); +// reduce_min +REGISTER_OP_CUDA_KERNEL( + reduce_min, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 63b209eeefa6b..476f4b8fb7b77 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -30,6 +30,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/array.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" @@ -150,6 +151,8 @@ static inline paddle::framework::Array VectorToArray( } // namespace detail +using Tensor = framework::Tensor; + enum ReduceType { kReduceAll = 0x00, // when reduce_rank == x_rank kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; @@ -736,5 +739,30 @@ struct 
TensorReduceFunctorImpl { } }; +template class ReduceOp> +class ReduceCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + const Tensor* input = context.Input("X"); + Tensor* output = context.Output("Out"); + auto out_dtype = context.Attr("out_dtype"); + std::vector dims = context.Attr>("dim"); + + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); + + gpuStream_t stream = context.cuda_device_context().stream(); + if (out_dtype >= 0) { + framework::VisitDataTypeSmall( + static_cast(out_dtype), + TensorReduceFunctorImpl(*input, output, reduce_dims, + stream)); + } else { + TensorReduceFunc(*input, output, reduce_dims, stream); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 1e996dcaea5be..9232a32ee55f3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -16,46 +16,19 @@ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" -namespace paddle { -namespace operators { - -template -class ReduceProdKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto out_dtype = context.Attr("out_dtype"); - auto dims = context.Attr>("dim"); - - std::vector reduce_dims = - detail::GetReduceDim(dims, input->dims().size(), reduce_all); - - auto stream = context.cuda_device_context().stream(); - if (out_dtype >= 0) { - framework::VisitDataTypeSmall( - static_cast(out_dtype), - TensorReduceFunctorImpl(*input, output, reduce_dims, - stream)); - } else { - TensorReduceFunc(*input, output, reduce_dims, stream); - } - } -}; - -} // namespace operators -} // namespace paddle +// reduce_prod #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, - ops::ReduceProdKernel, - ops::ReduceProdKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); #else -REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, - ops::ReduceProdKernel, - ops::ReduceProdKernel, - ops::ReduceProdKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); #endif From b841b340e0b86b380fd1636a38e54a486c1ad410 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 15 Jun 2021 07:12:43 +0000 Subject: [PATCH 29/34] REDUCE_SPLIT_BOUNDARY --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 476f4b8fb7b77..5e9b4bff34396 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -34,7 +34,8 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#define BOUNDARY 512 // Reduce split or not, Whether to use 
ReduceHigherDim +#define REDUCE_SPLIT_BOUNDARY \ + 512 // Reduce split or not, Whether to use ReduceHigherDim namespace paddle { namespace operators { @@ -309,7 +310,8 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_large_enough = (reduce_num > BOUNDARY / 2) || (left_num > BOUNDARY); + bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || + (left_num > REDUCE_SPLIT_BOUNDARY); if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); @@ -357,7 +359,7 @@ struct ReduceConfig { // init int num_block = (max_threads / left_num); - if (num_block > 1 && reduce_num >= BOUNDARY) { + if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { blocking_size = detail::GetLastPow2(reduce_num / num_block); if (blocking_size <= 1) { From 1fda4d5364452143072808f4f004752e33e7ac01 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 15 Jun 2021 17:41:01 +0800 Subject: [PATCH 30/34] Update reduce_op.cu.h --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 5e9b4bff34396..98ee5969c509d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -34,8 +34,8 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#define REDUCE_SPLIT_BOUNDARY \ - 512 // Reduce split or not, Whether to use ReduceHigherDim +// Reduce split or not, Whether to use ReduceHigherDim +#define REDUCE_SPLIT_BOUNDARY 512 namespace paddle { namespace operators { From c85ca05c3798964e956128c41ac155911e6568ce Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 16 Jun 2021 06:27:08 +0000 Subject: [PATCH 31/34] rename reduceTensorFunctor --- .../operators/reduce_ops/reduce_functor_op.h | 8 ++-- .../fluid/operators/reduce_ops/reduce_op.cu.h | 42 ++++++++++--------- .../operators/reduce_ops/reduce_prod_op.cu | 1 - 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 9cd3e1197af24..09ecbbc9e0f59 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include +#include #include - #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/platform/hostdevice.h" #ifdef __HIPCC__ #include #endif @@ -29,7 +29,7 @@ struct CustomMin { using Transformer = detail::IdentityFunctor; HOSTDEVICE __forceinline__ Ty initial() { - return std::numeric_limits::max(); + return static_cast(std::numeric_limits::max()); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { @@ -42,7 +42,7 @@ struct CustomMax { using Transformer = detail::IdentityFunctor; HOSTDEVICE __forceinline__ Ty initial() { - return std::numeric_limits::lowest(); + return static_cast(std::numeric_limits::lowest()); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 98ee5969c509d..000933a78f035 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -35,7 +35,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor_util.h" // Reduce split or not, Whether to use ReduceHigherDim -#define REDUCE_SPLIT_BOUNDARY 512 +#define REDUCE_SPLIT_BOUNDARY 512 namespace paddle { namespace operators { @@ -72,6 +72,11 @@ static inline std::vector GetReduceDim(const std::vector& dims, } } else { for (auto e : dims) { + PADDLE_ENFORCE_LT(e, dim_size, + paddle::platform::errors::InvalidArgument( + "ReduceOp: invalid axis, when x_dims is %d, " + "axis[i] should less than x_dims, but got %d.", + dim_size, e)); reduce_dims.push_back(e >= 0 ? e : e + dim_size); } } @@ -108,9 +113,7 @@ constexpr int kMaxThread = 128; // get blockDim for reduceLastDim and reduceAny static inline int GetBlockDim(int block_dim) { - return block_dim >= kMaxThread - ? kMaxThread - : (1 << static_cast(std::log2(block_dim))); + return block_dim >= kMaxThread ? kMaxThread : GetLastPow2(block_dim); } // check reduce rand is valid @@ -220,11 +223,11 @@ struct ReduceConfig { if (reduce_dim_temp.size() > 1) { for (int i = 1; i < x_dim.size(); i++) { - if (idx_reduce < reduce_dim_temp.size() && - i == reduce_dim_temp[idx_reduce]) { + if ((idx_reduce < reduce_dim_temp.size()) && + (i == reduce_dim_temp[idx_reduce])) { int result = reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; - bool is_equal = (result - num == 1); + bool is_equal = ((result - num) == 1); if (is_equal) { x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; num++; @@ -248,7 +251,6 @@ struct ReduceConfig { std::vector reduce_dim_new; int is_reduced = 0; for (auto e : reduce_dim) { - auto pos = e >= 0 ? 
e : e + x_dim.size(); is_reduced |= 1 << e; } @@ -312,6 +314,7 @@ struct ReduceConfig { int reduce_rank = reduce_dim.size(); bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || (left_num > REDUCE_SPLIT_BOUNDARY); + if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); @@ -319,7 +322,7 @@ struct ReduceConfig { reduce_type = static_cast(ReduceType::kReduceLastDim); } else if (reduce_rank == 1 && - (rank == 2 && is_large_enough || rank != 2)) { + ((rank == 2 && is_large_enough) || rank != 2)) { // ReduceFirstDim and reduceSecondDim reduce_type = static_cast(ReduceType::kReduceHigherDim); @@ -661,8 +664,9 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, template class ReduceOp> -void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, gpuStream_t stream) { +void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, + gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(); // get the parameters of LaunchReduceKernel @@ -725,19 +729,18 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, } template class ReduceOp> -struct TensorReduceFunctorImpl { +struct TensorReduceFunctor { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; gpuStream_t stream; - TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, - gpuStream_t stream) + TensorReduceFunctor(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, gpuStream_t stream) : x(x), y(y), origin_reduce_dims(origin_reduce_dims), stream(stream) {} template void apply() const { - TensorReduceFunc(x, y, origin_reduce_dims, stream); + TensorReduceFunctorImpl(x, y, origin_reduce_dims, stream); } }; @@ -758,10 +761,11 @@ class ReduceCudaKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctorImpl(*input, output, reduce_dims, - stream)); + TensorReduceFunctor(*input, output, reduce_dims, + stream)); } else { - TensorReduceFunc(*input, output, reduce_dims, stream); + TensorReduceFunctorImpl(*input, output, reduce_dims, + stream); } } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 9232a32ee55f3..4f259e415d222 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -17,7 +17,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" // reduce_prod - #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) From 9cc8ac3fc70848b698709f5fc6b56c7a071f766e Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 16 Jun 2021 11:42:39 +0000 Subject: [PATCH 32/34] rename TensorReduceFunc --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 000933a78f035..bdbfbc01f6743 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -729,13 +729,13 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, } template class 
ReduceOp> -struct TensorReduceFunctor { +struct TensorReduceFunc { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; gpuStream_t stream; - TensorReduceFunctor(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, gpuStream_t stream) + TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, gpuStream_t stream) : x(x), y(y), origin_reduce_dims(origin_reduce_dims), stream(stream) {} template @@ -761,8 +761,7 @@ class ReduceCudaKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctor(*input, output, reduce_dims, - stream)); + TensorReduceFunc(*input, output, reduce_dims, stream)); } else { TensorReduceFunctorImpl(*input, output, reduce_dims, stream); From 140779d56b1afeb669fc30171a106f1e3363a89b Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Thu, 17 Jun 2021 05:14:05 +0000 Subject: [PATCH 33/34] delete HOSTDEVICE --- .../operators/reduce_ops/reduce_functor_op.h | 14 +++---- .../fluid/operators/reduce_ops/reduce_op.cu.h | 42 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 09ecbbc9e0f59..0f02be21cc907 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -28,7 +28,7 @@ template struct CustomMin { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { + inline Ty initial() { return static_cast(std::numeric_limits::max()); } @@ -41,7 +41,7 @@ template struct CustomMax { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { + inline Ty initial() { return static_cast(std::numeric_limits::lowest()); } @@ -55,7 +55,7 @@ template struct CustomSum { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } + inline Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -66,7 +66,7 @@ template struct CustomMean { using Transformer = detail::DivideFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } + inline Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -77,7 +77,7 @@ template struct CustomMul { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(1.0f); } + inline Ty initial() { return static_cast(1.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b * a; @@ -88,7 +88,7 @@ template struct CustomLogicalOr { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(false); } + inline Ty initial() { return static_cast(false); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b || a; @@ -99,7 +99,7 @@ template struct CustomLogicalAnd { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(true); } + inline Ty initial() { return static_cast(true); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b && a; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h 
index bdbfbc01f6743..4da57015bb105 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -419,12 +419,12 @@ template __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, + TransformOp transformer, Ty init, int reduce_num) { __shared__ typename cub::BlockReduce::TempStorage temp_storage; int idx_x = blockIdx.x * reduce_num; int idx_y = threadIdx.x; - Ty reduce_var = reducer.initial(); + Ty reduce_var = init; for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) { reduce_var = reducer(reduce_var, static_cast(transformer(x[idx_x + idx_y]))); @@ -448,12 +448,12 @@ template __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, - int reduce_num, int left_num, - int block_size) { + Ty init, int reduce_num, + int left_num, int block_size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int idy = blockIdx.y * block_size; - Ty reduce_var = reducer.initial(); + Ty reduce_var = init; if (idx < left_num) { int loop = reduce_num - idy; @@ -532,7 +532,7 @@ __device__ __forceinline__ void ReduceAny( template __device__ __forceinline__ void ReduceModule( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, int reduce_num, int left_num, int blocking_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -542,12 +542,12 @@ __device__ __forceinline__ void ReduceModule( // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1 if (ReduceType == ReduceType::kReduceLastDim) { ReduceLastDim( - x, y, reducer, transformer, reduce_num); + x, y, reducer, transformer, init, reduce_num); // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (ReduceType == ReduceType::kReduceHigherDim) { ReduceHigherDim( - x, y, reducer, transformer, reduce_num, left_num, blocking_size); + x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); // reduce_rank >= 2 } else { @@ -560,7 +560,7 @@ __device__ __forceinline__ void ReduceModule( template __global__ void ReduceKernelFunction( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, int reduce_num, int left_num, int block_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -568,24 +568,24 @@ __global__ void ReduceKernelFunction( paddle::framework::Array left_dim, paddle::framework::Array left_strides) { ReduceModule(x, y, reducer, transformer, reduce_num, left_num, - block_size, x_strides, reduce_dim, reduce_strides, - left_dim, left_strides); + ReduceType>(x, y, reducer, transformer, init, reduce_num, + left_num, block_size, x_strides, reduce_dim, + reduce_strides, left_dim, left_strides); } template static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, - const TransformOp& transformer, gpuStream_t stream, - ReduceConfig config) { + const TransformOp& transformer, Ty init, + gpuStream_t stream, ReduceConfig config) { #define CUB_REDUCE_TYPE_CASE(type) \ case type: { \ constexpr auto kReduceType = type; \ ReduceKernelFunction< \ Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ kReduceType><<>>( \ - x_data, config.output_data, reducer, transformer, config.reduce_num, \ - config.left_num, config.blocking_size, \ + x_data, config.output_data, reducer, transformer, init, \ + config.reduce_num, 
config.left_num, config.blocking_size,                                \
        detail::VectorToArray<int, kRank>(config.x_strides),                  \
        detail::VectorToArray<int, kReduceRank>(config.reduce_dim),           \
        detail::VectorToArray<int, kReduceRank>(config.reduce_strides),       \
@@ -607,7 +607,7 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
         Ty, Ty, ReduceOp, detail::IdentityFunctor<Ty>, 128, kRank, kReduceRank,
         ReduceType::kReduceHigherDim><<<grid, block, 0, stream>>>(
         config.output_data, y_data, reducer,
-        detail::IdentityFunctor<Ty>(config.grid.y), config.grid.y,
+        detail::IdentityFunctor<Ty>(config.grid.y), init, config.grid.y,
         config.left_num, config.grid.y,
         detail::VectorToArray<int, kRank>(config.x_strides),
         detail::VectorToArray<int, kReduceRank>(config.reduce_dim),
@@ -621,7 +621,7 @@ template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
 static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
                                const ReduceOp& reducer,
-                               const TransformOp& transformer,
+                               const TransformOp& transformer, Ty init,
                                gpuStream_t stream, ReduceConfig<Ty> config) {
   int reduce_rank = config.reduce_strides.size();
   int rank = config.x_strides.size();
@@ -636,7 +636,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
   case i: {                                                                  \
     constexpr auto kReduceRank = i;                                          \
     LaunchKernel<Tx, Ty, BlockDim, ReduceOp, TransformOp, kRank, kReduceRank>( \
-        x_data, y_data, reducer, transformer, stream, config);               \
+        x_data, y_data, reducer, transformer, init, stream, config);         \
   } break

   detail::CheckReduceRank(reduce_rank, rank);
@@ -711,8 +711,8 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   case block_dim: {                                                          \
     constexpr auto kBlockDim = block_dim;                                    \
     LaunchReduceKernel<Tx, Ty, kBlockDim, ReduceOp<Tx, Ty>, TransformOp>(    \
-        x_data, y_data, reducer, TransformOp(config.reduce_num), stream,     \
-        config);                                                             \
+        x_data, y_data, reducer, TransformOp(config.reduce_num),             \
+        reducer.initial(), stream, config);                                  \
   } break

   switch (detail::GetBlockDim(config.reduce_num)) {

From fa3411c800f3f67d34395587f11016541715897f Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Thu, 17 Jun 2021 15:20:57 +0000
Subject: [PATCH 34/34] add left_num * grid.z * grid.y

---
 paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
index 4da57015bb105..5fad6efdb3496 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -191,7 +191,7 @@ struct ReduceConfig {
     if (should_reduce_again) {
       output_data = tmp->mutable_data<Ty>(
           framework::make_ddim(
-              {static_cast<int64_t>(left_num * grid.y * sizeof(Ty))}),
+              {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))}),
           place);
     } else {
       output_data = y_data;
@@ -674,10 +674,11 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   auto x_data = x.data<Tx>();
   auto y_data = y->mutable_data<Ty>(x.place());

-  framework::Tensor tmp;
+  // after config.run()
   // SetOutputData for ReduceHigherDim when should_reduce_again is true,
   // temp_output should be stored temp_data in output_data space or stored in
   // y_data;
+  framework::Tensor tmp;
   config.SetOutputData(y_data, x.place(), &tmp);

   if (config.reduce_num == 1) {
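// ---------------------------------------------------------------------------
// [Editor's note] Where this series lands: every reduce op is expressed as one
// small functor struct bundling an identity element (initial()), a binary
// combine (operator()), and a post-processing Transformer, all consumed by a
// single ReduceCudaKernel. The self-contained sketch below mirrors that
// pattern on the host so it compiles with any C++ compiler. Reduce() and
// main() are illustrative stand-ins for TensorReduceFunctorImpl and the CUDA
// launch path -- they are not Paddle code -- and HOSTDEVICE is stubbed out
// for the same reason.
#include <cassert>
#include <iostream>
#include <limits>
#include <vector>

#define HOSTDEVICE  // stand-in for __host__ __device__ when compiling off-GPU

// Post-processing functor: the identity, as used by max/min/prod/any/all.
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
  HOSTDEVICE explicit IdentityFunctor(int /*reduce_num*/) {}
  HOSTDEVICE Ty operator()(const Tx& x) const { return static_cast<Ty>(x); }
};

// One reducer per op: identity element + binary combine + Transformer alias.
template <typename Tx, typename Ty = Tx>
struct CustomMax {
  using Transformer = IdentityFunctor<Tx, Ty>;
  Ty initial() const { return std::numeric_limits<Ty>::lowest(); }
  HOSTDEVICE Ty operator()(const Ty& a, const Ty& b) const {
    return b > a ? b : a;
  }
};

// Hypothetical sequential stand-in for TensorReduceFunctorImpl: normalize the
// axes the way detail::GetReduceDim does (negative axes wrap around,
// out-of-range axes are rejected), then fold the data through the reducer
// starting from its identity element.
template <typename Tx, typename Ty,
          template <typename, typename> class ReduceOp>
Ty Reduce(const std::vector<Tx>& x, std::vector<int> dims, int rank) {
  for (int& d : dims) {
    assert(d < rank && "reduce axis must be smaller than the tensor rank");
    if (d < 0) d += rank;  // e.g. dim = -1 means the last axis
  }
  ReduceOp<Tx, Ty> reducer;
  typename ReduceOp<Tx, Ty>::Transformer transform(static_cast<int>(x.size()));
  Ty acc = reducer.initial();
  for (const Tx& v : x) acc = reducer(acc, transform(v));
  return acc;
}

int main() {
  std::vector<float> data{3.f, -1.f, 7.f, 2.f};
  // Reduce a rank-1 tensor over axis -1 (axis 0 after normalization).
  std::cout << Reduce<float, float, CustomMax>(data, {-1}, 1) << "\n";  // 7
  return 0;
}
// ---------------------------------------------------------------------------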