From a244f18dffa6e97242a19ee010debb546bfd234d Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 18 May 2021 15:21:14 +0000 Subject: [PATCH 01/34] max_min_prod_all_any --- .../operators/reduce_ops/reduce_all_op.cu | 41 +++++++++- .../operators/reduce_ops/reduce_any_op.cu | 43 +++++++++- .../operators/reduce_ops/reduce_functor_op.h | 71 ++++++++++++++++ .../operators/reduce_ops/reduce_max_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_min_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_prod_op.cu | 81 +++++++++++++++---- 6 files changed, 295 insertions(+), 45 deletions(-) create mode 100644 paddle/fluid/operators/reduce_ops/reduce_functor_op.h diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 89f3345fcbe42..9ba255081c942 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -13,7 +13,42 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_all, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAllKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(true), + CustomLogicalAnd(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_all, ops::BoolReduceAllKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c0f94098a351e..2fcf3ba07db38 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,7 +13,42 @@ // limitations under the License. 
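// Note: the reduce_any kernel below mirrors BoolReduceAllKernel above --
// identical dim handling, but with init = false and CustomLogicalOr as the
// reducer, so a single true element in the reduced axes flips the output.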
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_any, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAnyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(false), + CustomLogicalOr(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_any, ops::BoolReduceAnyKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h new file mode 100644 index 0000000000000..07625678ca3e9 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct CustomMin { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b < a) ? b : a; + } +}; + +template +struct CustomMax { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return (b > a) ? 
b : a; + } +}; + +template +struct CustomSum { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b + a; + } +}; + +template +struct CustomMul { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b * a; + } +}; + +template +struct CustomLogicalOr { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b || a; + } +}; + +template +struct CustomLogicalAnd { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b && a; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 832112ede833a..acc06c6ea7a09 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_max, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_max, ops::ReduceMaxKernel, + ops::ReduceMaxKernel, ops::ReduceMaxKernel, + ops::ReduceMaxKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 7b2706866f594..6faa314834cba 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. 
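// Note: ReduceMinKernel below is the mirror image of ReduceMaxKernel --
// same structure, with init = FLT_MAX and CustomMin in place of
// -FLT_MAX and CustomMax.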
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_min, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMinKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_min, ops::ReduceMinKernel, + ops::ReduceMinKernel, ops::ReduceMinKernel, + ops::ReduceMinKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 44e76c78b1f3e..88bd0a7e9accf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -12,26 +12,75 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +namespace paddle { +namespace operators { + +template +class ReduceProdKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto out_dtype = context.Attr("out_dtype"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + if (out_dtype >= 0) { +#define VisitDataTypeSmall_t(cpp_type, proto_type) \ + do { \ + if (static_cast(out_dtype) == \ + proto_type) { \ + TensorReduce, \ + detail::IdentityFunctor>( \ + *input, output, reduce_dims, static_cast(1.0f), \ + CustomMul(), detail::IdentityFunctor(), stream); \ + } \ + } while (0) + _ForEachDataTypeSmall_(VisitDataTypeSmall_t); +#undef VisitDataTypeSmall_t + } else { + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(1.0f), CustomMul(), + detail::IdentityFunctor(), stream); + } + } +}; + +} // namespace operators +} // namespace paddle + #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #else -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #endif From af4db5db4ac616d155225e9e3a6482e52263ddd7 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 24 May 2021 16:12:41 +0800 Subject: [PATCH 02/34] Update reduce_any_op.cu --- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 2fcf3ba07db38..ff0f26ca24307 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
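Every kernel introduced in PATCH 01 repeats the same dim-normalization step
before launching the reduction: when reduce_all is set, every axis of the
input is reduced; otherwise negative axes from the "dim" attribute are
wrapped around. A standalone sketch of that shared logic (the helper name
NormalizeReduceDims is illustrative, not part of the patch -- each kernel
inlines the equivalent loop in its Compute() method):

    #include <vector>

    // Illustrative sketch only; `rank` stands for input->dims().size().
    std::vector<int> NormalizeReduceDims(const std::vector<int>& dims,
                                         int rank, bool reduce_all) {
      std::vector<int> reduce_dims;
      if (reduce_all) {
        // reduce over every axis: 0, 1, ..., rank - 1
        reduce_dims.resize(rank);
        for (int i = 0; i < rank; ++i) {
          reduce_dims[i] = i;
        }
      } else {
        // wrap negative axes, e.g. -1 becomes rank - 1 (the last axis)
        for (auto e : dims) {
          reduce_dims.push_back(e >= 0 ? e : e + rank);
        }
      }
      return reduce_dims;
    }

With rank = 3 and dims = {-1} the result is {2}; with reduce_all = true it
is {0, 1, 2}.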
From d80406608bed0f5d96e6dc8e4bdaaf48e6b99afb Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:12:09 +0000 Subject: [PATCH 03/34] modified --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 9ba255081c942..957f475134169 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -42,7 +42,7 @@ class BoolReduceAllKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(true), CustomLogicalAnd(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 2fcf3ba07db38..99716758b9a08 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -42,7 +42,7 @@ class BoolReduceAnyKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(false), CustomLogicalOr(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index acc06c6ea7a09..33825da575bd6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -41,7 +41,7 @@ class ReduceMaxKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 6faa314834cba..1667d350b9564 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -41,7 +41,7 @@ class ReduceMinKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 88bd0a7e9accf..e8cfc4d87e4dd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -53,8 +53,8 @@ class ReduceProdKernel : public framework::OpKernel { do { \ if (static_cast(out_dtype) == \ proto_type) { \ - TensorReduce, \ - detail::IdentityFunctor>( \ + TensorReduceFunc, \ + detail::IdentityFunctor>( \ *input, output, reduce_dims, static_cast(1.0f), \ CustomMul(), detail::IdentityFunctor(), stream); \ } \ @@ -62,7 +62,7 @@ class ReduceProdKernel : public 
framework::OpKernel { _ForEachDataTypeSmall_(VisitDataTypeSmall_t); #undef VisitDataTypeSmall_t } else { - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(1.0f), CustomMul(), detail::IdentityFunctor(), stream); } From 6ea9e9ad7853d226e9962031251e0046e7e2b265 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:23:47 +0000 Subject: [PATCH 04/34] copyright --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 957f475134169..1500d513ce9bf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 405389997067a..ea06b8dfc58f6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 33825da575bd6..dc04b094b4288 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 1667d350b9564..88f654060d4c6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index e8cfc4d87e4dd..27259acc928ea 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From ff0a6e973d3084c41bc411fc38f74b952eee0125 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 02:18:47 +0000 Subject: [PATCH 05/34] modified and {} for loop --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 5 ++++- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 4 +++- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 1500d513ce9bf..ac5e35a350a8d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -34,7 +34,9 @@ class BoolReduceAllKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index ea06b8dfc58f6..0b3799f216328 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -34,7 +34,10 @@ class BoolReduceAnyKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index dc04b094b4288..cbf8101c3668b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -33,7 +33,9 @@ class ReduceMaxKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 88f654060d4c6..406398bb70a9b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -33,7 +33,9 @@ class ReduceMinKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 27259acc928ea..10f2df15a51d0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -35,7 +35,9 @@ class ReduceProdKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); From 7ddaf91d613925f649a9019d2c4ab528a2971b0b Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 18 May 2021 15:21:14 +0000 Subject: [PATCH 06/34] max_min_prod_all_any --- .../operators/reduce_ops/reduce_all_op.cu | 41 +++++++++- .../operators/reduce_ops/reduce_any_op.cu | 43 +++++++++- .../operators/reduce_ops/reduce_functor_op.h | 14 ++++ .../operators/reduce_ops/reduce_max_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_min_op.cu | 52 +++++++++--- .../operators/reduce_ops/reduce_prod_op.cu | 81 +++++++++++++++---- 6 files changed, 238 insertions(+), 45 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 89f3345fcbe42..9ba255081c942 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -13,7 +13,42 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_all, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAllKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(true), + CustomLogicalAnd(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_all, ops::BoolReduceAllKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c0f94098a351e..2fcf3ba07db38 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,7 +13,42 @@ // limitations under the License. 
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_any, ops::BoolReduceKernel); +namespace paddle { +namespace operators { + +template +class BoolReduceAnyKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(false), + CustomLogicalOr(), detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_any, ops::BoolReduceAnyKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index f4ea18edb2a95..2aae10337c4d3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -54,5 +54,19 @@ struct CustomMul { } }; +template +struct CustomLogicalOr { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b || a; + } +}; + +template +struct CustomLogicalAnd { + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b && a; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 832112ede833a..acc06c6ea7a09 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_max, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_max, ops::ReduceMaxKernel, + ops::ReduceMaxKernel, ops::ReduceMaxKernel, + ops::ReduceMaxKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 7b2706866f594..6faa314834cba 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -12,14 +12,44 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_min, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMinKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), + detail::IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_min, ops::ReduceMinKernel, + ops::ReduceMinKernel, ops::ReduceMinKernel, + ops::ReduceMinKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 44e76c78b1f3e..88bd0a7e9accf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -12,26 +12,75 @@ // See the License for the specific language governing permissions and // limitations under the License. 
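// Note: reduce_prod below adds one twist over the max/min kernels: it reads
// the out_dtype attribute and, when out_dtype >= 0, dispatches through
// _ForEachDataTypeSmall_ so the reduction can accumulate in the requested
// output type instead of the input type T.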
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +namespace paddle { +namespace operators { + +template +class ReduceProdKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto out_dtype = context.Attr("out_dtype"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + if (out_dtype >= 0) { +#define VisitDataTypeSmall_t(cpp_type, proto_type) \ + do { \ + if (static_cast(out_dtype) == \ + proto_type) { \ + TensorReduce, \ + detail::IdentityFunctor>( \ + *input, output, reduce_dims, static_cast(1.0f), \ + CustomMul(), detail::IdentityFunctor(), stream); \ + } \ + } while (0) + _ForEachDataTypeSmall_(VisitDataTypeSmall_t); +#undef VisitDataTypeSmall_t + } else { + TensorReduce, detail::IdentityFunctor>( + *input, output, reduce_dims, static_cast(1.0f), CustomMul(), + detail::IdentityFunctor(), stream); + } + } +}; + +} // namespace operators +} // namespace paddle + #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #else -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel, + ops::ReduceProdKernel); #endif From a43af7dd0d43b3418773a7bf3abab63d98a3d9d4 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 24 May 2021 16:12:41 +0800 Subject: [PATCH 07/34] Update reduce_any_op.cu --- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 2fcf3ba07db38..ff0f26ca24307 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
From 0a70b826cb69cbb3a3832ddae6d39efbefc9ef88 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:12:09 +0000 Subject: [PATCH 08/34] modified --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 6 +++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 9ba255081c942..957f475134169 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -42,7 +42,7 @@ class BoolReduceAllKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(true), CustomLogicalAnd(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index ff0f26ca24307..405389997067a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -42,7 +42,7 @@ class BoolReduceAnyKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(false), CustomLogicalOr(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index acc06c6ea7a09..33825da575bd6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -41,7 +41,7 @@ class ReduceMaxKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 6faa314834cba..1667d350b9564 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -41,7 +41,7 @@ class ReduceMinKernel : public framework::OpKernel { } auto stream = context.cuda_device_context().stream(); - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), detail::IdentityFunctor(), stream); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 88bd0a7e9accf..e8cfc4d87e4dd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -53,8 +53,8 @@ class ReduceProdKernel : public framework::OpKernel { do { \ if (static_cast(out_dtype) == \ proto_type) { \ - TensorReduce, \ - detail::IdentityFunctor>( \ + TensorReduceFunc, \ + detail::IdentityFunctor>( \ *input, output, reduce_dims, static_cast(1.0f), \ CustomMul(), detail::IdentityFunctor(), stream); \ } \ @@ -62,7 +62,7 @@ class ReduceProdKernel : public 
framework::OpKernel { _ForEachDataTypeSmall_(VisitDataTypeSmall_t); #undef VisitDataTypeSmall_t } else { - TensorReduce, detail::IdentityFunctor>( + TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(1.0f), CustomMul(), detail::IdentityFunctor(), stream); } From c91b26b7e5ccbbc4ad1f0fee65fec484843e2252 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Mon, 24 May 2021 11:23:47 +0000 Subject: [PATCH 09/34] copyright --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 2 +- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 957f475134169..1500d513ce9bf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 405389997067a..ea06b8dfc58f6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. Any Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 33825da575bd6..dc04b094b4288 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 1667d350b9564..88f654060d4c6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index e8cfc4d87e4dd..27259acc928ea 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From 54651e0b26cae192349ea151c71b0f1db17e4ba1 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 02:18:47 +0000 Subject: [PATCH 10/34] modified and {} for loop --- paddle/fluid/operators/reduce_ops/reduce_all_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_any_op.cu | 5 ++++- paddle/fluid/operators/reduce_ops/reduce_max_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_min_op.cu | 4 +++- paddle/fluid/operators/reduce_ops/reduce_prod_op.cu | 4 +++- 5 files changed, 16 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 1500d513ce9bf..ac5e35a350a8d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -34,7 +34,9 @@ class BoolReduceAllKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index ea06b8dfc58f6..0b3799f216328 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -34,7 +34,10 @@ class BoolReduceAnyKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index dc04b094b4288..cbf8101c3668b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -33,7 +33,9 @@ class ReduceMaxKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 88f654060d4c6..406398bb70a9b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -33,7 +33,9 @@ class ReduceMinKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 27259acc928ea..10f2df15a51d0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -35,7 +35,9 @@ class ReduceProdKernel : public framework::OpKernel { std::vector reduce_dims; if (reduce_all) { reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } } else { for (auto e : dims) { reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); From 35411f7450a34b0bf978dd573a9ac55906c8916a Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 03:19:44 +0000 Subject: [PATCH 11/34] add notes for reduce_op.cuh --- .../fluid/operators/reduce_ops/reduce_op.cuh | 114 ++++++++++++------ 1 file changed, 74 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 91d7fb7c8439a..ed36260c8689f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -65,8 +65,9 @@ static inline int GetLastPow2(int n) { return std::max(1, n - (n >> 1)); } -static inline std::vector GetStrides(const std::vector& dims, - const std::vector& idx) { +// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny +static inline std::vector GetDimStrides(const std::vector& dims, + const std::vector& idx) { int n = static_cast(idx.size()); if (n == 0) return std::vector(); std::vector strides(n); @@ -83,13 +84,15 @@ constexpr int kMaxBlockDim = 256; constexpr int kMaxBlockDim = 512; #endif -static inline int GetDesiredBlockDim(int block_dim) { +// get blockDim for reduceLastDim and reduceAny +static inline int GetBlockDim(int block_dim) { return block_dim >= kMaxBlockDim ? 
kMaxBlockDim : (1 << static_cast(std::log2(block_dim))); } -static inline void CheckReduceRankIsValid(int reduce_rank, int rank) { +// check reduce rand is valid +static inline void CheckReduceRank(int reduce_rank, int rank) { if (rank % 2 == 0) { PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, platform::errors::InvalidArgument( @@ -108,8 +111,9 @@ static inline void CheckReduceRankIsValid(int reduce_rank, int rank) { } } +// convert dims from vector to array template -static inline paddle::framework::Array from( +static inline paddle::framework::Array VectorToArray( const VectorLikeType& vec) { PADDLE_ENFORCE_EQ(vec.size(), ElementCount, platform::errors::InvalidArgument( @@ -118,17 +122,19 @@ static inline paddle::framework::Array from( vec.size(), ElementCount)); size_t n = static_cast(vec.size()); paddle::framework::Array ret; - for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; + for (size_t i = 0; i < n; ++i) { + ret[i] = vec[i]; + } return ret; } } // namespace detail enum ReduceType { - kReduceAll = 0x00, - kReduceLastDim = 0x01, + kReduceAll = 0x00, // when reduce_rank == x_rank + kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim - kReduceAny = 0x03, + kReduceAny = 0x03, // when reduce_dim.size() > 1 }; // reduce config @@ -141,10 +147,13 @@ struct ReduceConfig { void Run() { // step1: update the reduce_dim left_dim and x_dim SetReduceDim(); + // step2: get the strides of dim for reduceAny and reduceLastDim SetStrides(); + // step3: get the type of reduce SetReduceType(); + // step4: set the block and grid for launch kernel SetBlockDim(); } @@ -237,9 +246,9 @@ struct ReduceConfig { idx_dim.push_back(i); } - x_strides = detail::GetStrides(x_dim, idx_dim); - reduce_strides = detail::GetStrides(x_dim, reduce_dim); - left_strides = detail::GetStrides(x_dim, left_dim); + x_strides = detail::GetDimStrides(x_dim, idx_dim); + reduce_strides = detail::GetDimStrides(x_dim, reduce_dim); + left_strides = detail::GetDimStrides(x_dim, left_dim); reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; left_num = 1; @@ -277,7 +286,7 @@ struct ReduceConfig { // for others: block(block_num, 1) , grid(left_num, 1) void SetBlockDim() { // init - int block_num = detail::GetDesiredBlockDim(reduce_num); + int block_num = detail::GetBlockDim(reduce_num); should_reduce_again = false; dim3 block_dim(block_num, 1); @@ -352,6 +361,9 @@ struct ReduceConfig { dim3 grid; }; +// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, this +// function will be used +// blockId.x -> left_num, threadId.x -> reduce_num template __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, @@ -362,8 +374,9 @@ __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, int idx_x = blockIdx.x * reduce_num; int idx_y = threadIdx.x; Ty reduce_var = init; - for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) { reduce_var = reducer(reduce_var, static_cast(x[idx_x + idx_y])); + } __syncthreads(); reduce_var = @@ -374,6 +387,11 @@ __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, } } +// when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this +// function will be used +// eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 +// if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32 +// else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 template __device__ __forceinline__ void 
ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, @@ -389,15 +407,20 @@ __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, if (idx < left_num) { int loop = reduce_num - idy; loop = loop > block_size ? block_size : loop; + for (int iy = 0; iy < loop; iy++) { int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; reduce_var = reducer(reduce_var, static_cast(x[id])); } + y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = static_cast(transformer(reduce_var)); } } +// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this +// function will be used +// blockId.x -> left_num, threadId.x -> reduce_num template __device__ __forceinline__ void ReduceAny( @@ -423,18 +446,24 @@ __device__ __forceinline__ void ReduceAny( } int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + for (int k = 0; k < Rank; ++k) { + idx_x += (sub_index[k] * x_strides[k]); + } Ty reduce_var = static_cast(x[idx_x]); for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { int reduce_idx = i; + for (int j = 0; j < ReduceRank; ++j) { sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; reduce_idx %= reduce_strides[j]; } int idx_x = 0; - for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + for (int k = 0; k < Rank; ++k) { + idx_x += (sub_index[k] * x_strides[k]); + } + reduce_var = static_cast(reducer(reduce_var, static_cast(x[idx_x]))); } @@ -448,6 +477,7 @@ __device__ __forceinline__ void ReduceAny( } } +// module function designed for global function template __device__ __forceinline__ void ReduceModule( @@ -458,14 +488,17 @@ __device__ __forceinline__ void ReduceModule( paddle::framework::Array reduce_strides, paddle::framework::Array left_dim, paddle::framework::Array left_strides) { + // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1 if (ReduceType == ReduceType::kReduceLastDim) { ReduceLastDim( x, y, reducer, transformer, init, reduce_num); + // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (ReduceType == ReduceType::kReduceHigherDim) { ReduceHigherDim( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); + // reduce_rank >= 2 } else { ReduceAny( x, y, reducer, transformer, init, reduce_num, x_strides, reduce_dim, @@ -491,23 +524,23 @@ __global__ void ReduceKernelFunction( template -static void launchKernel(const Tx* x_data, Ty* y_data, +static void LaunchKernel(const Tx* x_data, Ty* y_data, const platform::Place& place, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, gpuStream_t stream, ReduceConfig config) { -#define CUB_REDUCE_TYPE_CASE(type) \ - case type: { \ - constexpr auto kReduceType = type; \ - ReduceKernelFunction< \ - Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ - kReduceType><<>>( \ - x_data, config.output_data, reducer, transformer, init, \ - config.reduce_num, config.left_num, config.blocking_size, \ - detail::from(config.x_strides), \ - detail::from(config.reduce_dim), \ - detail::from(config.reduce_strides), \ - detail::from(config.left_dim), \ - detail::from(config.left_strides)); \ +#define CUB_REDUCE_TYPE_CASE(type) \ + case type: { \ + constexpr auto kReduceType = type; \ + ReduceKernelFunction< \ + Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ + kReduceType><<>>( \ + x_data, config.output_data, reducer, transformer, init, \ + config.reduce_num, config.left_num, config.blocking_size, \ + detail::VectorToArray(config.x_strides), \ + 
detail::VectorToArray(config.reduce_dim), \ + detail::VectorToArray(config.reduce_strides), \ + detail::VectorToArray(config.left_dim), \ + detail::VectorToArray(config.left_strides)); \ } break switch (config.reduce_type) { @@ -525,17 +558,17 @@ static void launchKernel(const Tx* x_data, Ty* y_data, ReduceType::kReduceHigherDim><<>>( config.output_data, y_data, reducer, detail::IdentityFunctor(), init, config.grid.y, config.left_num, config.grid.y, - detail::from(config.x_strides), - detail::from(config.reduce_dim), - detail::from(config.reduce_strides), - detail::from(config.left_dim), - detail::from(config.left_strides)); + detail::VectorToArray(config.x_strides), + detail::VectorToArray(config.reduce_dim), + detail::VectorToArray(config.reduce_strides), + detail::VectorToArray(config.left_dim), + detail::VectorToArray(config.left_strides)); } } template -static void launchReduceKernel(const Tx* x_data, Ty* y_data, +static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, const platform::Place& place, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, @@ -573,7 +606,7 @@ static void launchReduceKernel(const Tx* x_data, Ty* y_data, return; } - detail::CheckReduceRankIsValid(reduce_rank, rank); + detail::CheckReduceRank(reduce_rank, rank); switch (rank) { CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); @@ -595,6 +628,7 @@ static void launchReduceKernel(const Tx* x_data, Ty* y_data, #undef CUB_REDUCE_RANK_CASE #undef CUB_RANK_CASE } + template void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, const Ty& init, @@ -602,7 +636,7 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); - config.Run(); + config.Run(); // get the parameters of LaunchReduceKernel auto x_data = x.data(); auto y_data = y->mutable_data(x.place()); @@ -623,12 +657,12 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #define CUB_BLOCK_DIM_CASE(block_dim) \ case block_dim: { \ constexpr auto kBlockDim = block_dim; \ - launchReduceKernel( \ + LaunchReduceKernel( \ x_data, y_data, x.place(), reducer, transformer, init, stream, \ config); \ } break - switch (detail::GetDesiredBlockDim(config.reduce_num)) { + switch (detail::GetBlockDim(config.reduce_num)) { CUB_BLOCK_DIM_CASE(512); CUB_BLOCK_DIM_CASE(256); CUB_BLOCK_DIM_CASE(128); From 8cea954ff2cb547ce00e1c30bccbb9f99cdd66f6 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 05:58:12 +0000 Subject: [PATCH 12/34] update --- paddle/fluid/operators/reduce_ops/reduce_functor_op.h | 9 --------- paddle/fluid/operators/reduce_ops/reduce_op.cuh | 6 +++--- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 2aae10337c4d3..15a586d526f74 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,15 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/hostdevice.h" -#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index ed36260c8689f..25fd8ceb8ef48 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -160,9 +160,9 @@ struct ReduceConfig { // when should_reduce_again is true, we need malloc temp space for temp data void SetOutputData(Ty* y_data, const platform::Place& place, - framework::Tensor& tmp) { + framework::Tensor* tmp) { if (should_reduce_again) { - output_data = tmp.mutable_data( + output_data = tmp->mutable_data( framework::make_ddim( {static_cast(left_num * grid.y * sizeof(Ty))}), place); @@ -585,7 +585,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, #define CUB_REDUCE_RANK_CASE(i, ...) \ case i: { \ constexpr auto kReduceRank = i; \ - launchKernel( \ + LaunchKernel( \ x_data, y_data, place, reducer, transformer, init, stream, config); \ } break From a719c3cd9b83dd5328a1263d75b8584a4eb18008 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 06:26:05 +0000 Subject: [PATCH 13/34] update --- paddle/fluid/operators/reduce_ops/reduce_op.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 25fd8ceb8ef48..c21095ea389f2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -645,7 +645,7 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, // SetOutputData for ReduceHigherDim when should_reduce_again is true, // temp_output should be stored temp_data in output_data space or stored in // y_data; - config.SetOutputData(y_data, x.place(), tmp); + config.SetOutputData(y_data, x.place(), &tmp); if (config.reduce_num == 1) { auto out_dims = y->dims(); From 2e8ad8fd78b88272b93bd40ff5681457484a73a3 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 25 May 2021 09:31:29 +0000 Subject: [PATCH 14/34] update --- paddle/fluid/operators/reduce_ops/reduce_op.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index c21095ea389f2..5208518865b05 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -79,15 +79,15 @@ static inline std::vector GetDimStrides(const std::vector& dims, } #ifdef __HIPCC__ -constexpr int kMaxBlockDim = 256; +constexpr int kMaxBlock = 256; #else -constexpr int kMaxBlockDim = 512; +constexpr int kMaxBlock = 512; #endif // get blockDim for reduceLastDim and reduceAny static inline int GetBlockDim(int block_dim) { - return block_dim >= kMaxBlockDim - ? kMaxBlockDim + return block_dim >= kMaxBlock + ? 
kMaxBlock
             : (1 << static_cast(std::log2(block_dim)));
 }

From a60b90a688760540c99f73f33d2caf170a54f05d Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Thu, 27 May 2021 02:50:53 +0000
Subject: [PATCH 15/34] fix a bug in reduce_op.cuh

---
 .../operators/reduce_ops/reduce_all_op.cu     |  2 +-
 .../operators/reduce_ops/reduce_any_op.cu     |  2 +-
 .../operators/reduce_ops/reduce_max_op.cu     |  2 +-
 .../operators/reduce_ops/reduce_min_op.cu     |  2 +-
 .../fluid/operators/reduce_ops/reduce_op.cuh  | 64 ++++++++++++-------
 .../operators/reduce_ops/reduce_prod_op.cu    |  2 +-
 6 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
index ac5e35a350a8d..baea69e739c10 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
index 0b3799f216328..57e2c4469cfad 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. Any Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
index cbf8101c3668b..a7524c00ad509 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
index 406398bb70a9b..a81ca77a09bc0 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
index 5208518865b05..0eb402b5e699a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
@@ -86,9 +86,8 @@ constexpr int kMaxBlock = 512;

 // get blockDim for reduceLastDim and reduceAny
 static inline int GetBlockDim(int block_dim) {
-  return block_dim >= kMaxBlock
-             ? kMaxBlock
-             : (1 << static_cast(std::log2(block_dim)));
+  return block_dim >= kMaxBlock ?
kMaxBlock + : (1 << static_cast(std::log2(block_dim))); } // check reduce rand is valid @@ -177,50 +176,71 @@ struct ReduceConfig { // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] void SetReduceDim() { std::set reduce_set; - for (auto e : reduce_dims_origin) { auto pos = e >= 0 ? e : e + x_dim.size(); reduce_set.insert(pos); } + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); - // get reduce_dim + + // update reduce_dim and x_dim + std::vector x_new_dim; + + reduce_dim.push_back(reduce_dim_temp[0]); + x_new_dim.push_back(x_dim[0]); + + int idx_reduce = 1; + int num = 0; + if (reduce_dim_temp.size() > 1) { - int num = 0; // for update axis - reduce_dim.push_back(reduce_dim_temp[0]); - for (int idx = 1; idx < reduce_dim_temp.size(); idx++) { - // update x_dim - if (reduce_dim_temp[idx] - reduce_dim_temp[idx - 1] == 1) { - x_dim[reduce_dim_temp[idx - 1]] *= x_dim[reduce_dim_temp[idx]]; - x_dim.erase(x_dim.begin() + reduce_dim_temp[idx]); - num++; + for (int i = 1; i < x_dim.size(); i++) { + if (idx_reduce < reduce_dim_temp.size() && + i == reduce_dim_temp[idx_reduce]) { + int result = + reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; + bool is_equal = (result - num == 1); + if (is_equal) { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + num++; + } else { + reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); + x_new_dim.push_back(x_dim[i]); + } + idx_reduce++; } else { - reduce_dim.push_back(reduce_dim_temp[idx] - num); + x_new_dim.push_back(x_dim[i]); } } } else { - reduce_dim = reduce_dim_temp; + x_new_dim = x_dim; } - // update new_x_dim and new_reduce_dim - std::vector new_x_dim, new_reduce_dim_temp; + // update x_dim + x_dim = x_new_dim; + std::vector().swap(x_new_dim); + + std::vector reduce_dim_new; int is_reduced = 0; for (auto e : reduce_dim) { + auto pos = e >= 0 ? e : e + x_dim.size(); is_reduced |= 1 << e; } + std::vector().swap(reduce_dim); + for (int i = 0; i < x_dim.size(); i++) { if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { - new_x_dim.push_back(x_dim[i]); + x_new_dim.push_back(x_dim[i]); if ((is_reduced >> i) & 1) - new_reduce_dim_temp.push_back(new_x_dim.size() - 1); + reduce_dim_new.push_back(x_new_dim.size() - 1); } else { - new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; } } - x_dim = new_x_dim; - reduce_dim = new_reduce_dim_temp; + x_dim = x_new_dim; + reduce_dim = reduce_dim_new; int x_rank = static_cast(x_dim.size()); std::set left_set; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 10f2df15a51d0..9f94a1bebf0ea 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
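The SetReduceDim rewrite in PATCH 15 above folds adjacent reduce axes into one another and then fuses runs of same-kind axes, so the kernels only ever see a minimal-rank problem. Below is a minimal host-side sketch of that merging behaviour; it is a simplified re-implementation for illustration only, not the literal Paddle code, and the function name MergeReduceDims and the example shapes are invented for this note.

#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

// Fuse adjacent axes of the same kind (reduce / non-reduce), mirroring the
// net effect of the patched SetReduceDim.
static void MergeReduceDims(std::vector<int64_t>* x_dim,
                            std::vector<int>* reduce_dim) {
  std::set<int> reduce_set(reduce_dim->begin(), reduce_dim->end());
  std::vector<int64_t> new_x;
  std::vector<int> new_reduce;
  for (int i = 0; i < static_cast<int>(x_dim->size()); ++i) {
    bool is_reduce = reduce_set.count(i) > 0;
    bool same_kind_as_prev =
        i > 0 && (reduce_set.count(i - 1) > 0) == is_reduce;
    if (same_kind_as_prev) {
      new_x.back() *= (*x_dim)[i];  // fold into the previous axis
    } else {
      new_x.push_back((*x_dim)[i]);
      if (is_reduce) new_reduce.push_back(static_cast<int>(new_x.size()) - 1);
    }
  }
  *x_dim = new_x;
  *reduce_dim = new_reduce;
}

int main() {
  std::vector<int64_t> x_dim = {8, 4, 4, 16};
  std::vector<int> reduce_dim = {1, 2};
  MergeReduceDims(&x_dim, &reduce_dim);
  for (auto d : x_dim) std::cout << d << ' ';               // prints: 8 16 16
  std::cout << "| reduce axis: " << reduce_dim[0] << '\n';  // reduce axis: 1
  return 0;
}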
From 4bd964486133ff0c1d7f3567f1e56c322165e8a4 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Fri, 28 May 2021 11:26:17 +0000 Subject: [PATCH 16/34] reset reduce_any and reduce_all --- .../operators/reduce_ops/reduce_all_op.cu | 43 ++---------------- .../operators/reduce_ops/reduce_any_op.cu | 44 ++----------------- 2 files changed, 6 insertions(+), 81 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index baea69e739c10..89f3345fcbe42 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -13,44 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class BoolReduceAllKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - auto stream = context.cuda_device_context().stream(); - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(true), - CustomLogicalAnd(), detail::IdentityFunctor(), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_all, ops::BoolReduceAllKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_all, ops::BoolReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index 57e2c4469cfad..c0f94098a351e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -13,45 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class BoolReduceAnyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size());
-      }
-    }
-
-    auto stream = context.cuda_device_context().stream();
-    TensorReduceFunc, detail::IdentityFunctor>(
-        *input, output, reduce_dims, static_cast(false),
-        CustomLogicalOr(), detail::IdentityFunctor(), stream);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(reduce_any, ops::BoolReduceAnyKernel);
+REGISTER_OP_CUDA_KERNEL(
+    reduce_any, ops::BoolReduceKernel);

From bf701a226fc9111d4f018fadbe5492d96f61fed3 Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Mon, 31 May 2021 02:25:06 +0000
Subject: [PATCH 17/34] delete __forceinline__ in reduce_functor_op.h

---
 .../operators/reduce_ops/reduce_functor_op.h | 20 ++++------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
index 15a586d526f74..069a521245479 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -19,44 +19,36 @@ namespace operators {

 template
 struct CustomMin {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  __device__ T operator()(const T &a, const T &b) const {
     return (b < a) ? b : a;
   }
 };

 template
 struct CustomMax {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+  __device__ T operator()(const T &a, const T &b) const {
     return (b > a) ? b : a;
   }
 };

 template
 struct CustomSum {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b + a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b + a; }
 };

 template
 struct CustomMul {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b * a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b * a; }
 };

 template
 struct CustomLogicalOr {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b || a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b || a; }
 };

 template
 struct CustomLogicalAnd {
-  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
-    return b && a;
-  }
+  __device__ T operator()(const T &a, const T &b) const { return b && a; }
 };

 }  // namespace operators

From 6174b5021d6e97667a7376db5e06f2558805db65 Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Mon, 31 May 2021 03:53:58 +0000
Subject: [PATCH 18/34] from DEVICE to HOSTDEVICE

---
 paddle/fluid/operators/reduce_ops/reduce_op.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
index 0eb402b5e699a..43666ccc8a940 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh
@@ -40,17 +40,17 @@ namespace detail {

 // Post processing function for sum, max, min, prod, any
 template
 struct IdentityFunctor {
-  DEVICE explicit inline IdentityFunctor() {}
+  HOSTDEVICE explicit inline IdentityFunctor() {}

-  DEVICE inline T operator()(const T& x) const { return x; }
+  HOSTDEVICE inline T operator()(const T& x) const { return x; }
 };

 // Post processing function for mean
 template
 struct DivideFunctor {
-  DEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {}

-  DEVICE inline T operator()(const T& x) const { return x * n_inv; }
+  HOSTDEVICE inline T operator()(const T& x) const { return
x * n_inv; } private: T n_inv; From 59c32d6f3c484c88becf735d5f29a0d8ac736104 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 1 Jun 2021 11:06:14 +0000 Subject: [PATCH 19/34] add DataBound struct for reduce_max and reduce_min --- .../operators/reduce_ops/reduce_functor_op.h | 35 +++++++++++++++++++ .../operators/reduce_ops/reduce_max_op.cu | 2 +- .../operators/reduce_ops/reduce_min_op.cu | 2 +- 3 files changed, 37 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 069a521245479..718ae6715e502 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,10 +13,45 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include +#include +#include namespace paddle { namespace operators { +template +struct DataBound { + static inline T max() { return static_cast(FLT_MAX); } + static inline T min() { return static_cast(-FLT_MAX); } +}; + +template <> +struct DataBound { + static inline float max() { return FLT_MAX; } + static inline float min() { return -FLT_MAX; } +}; + +template <> +struct DataBound { + static inline double max() { return DBL_MAX; } + static inline double min() { return -DBL_MAX; } +}; + +template <> +struct DataBound { + static inline int32_t max() { return INT32_MAX; } + static inline int32_t min() { return INT32_MIN; } +}; + +template <> +struct DataBound { + static inline int64_t max() { return INT64_MAX; } + static inline int64_t min() { return INT64_MIN; } +}; + template struct CustomMin { __device__ T operator()(const T &a, const T &b) const { diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index a7524c00ad509..86a374bc788dc 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -44,7 +44,7 @@ class ReduceMaxKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(-FLT_MAX), CustomMax(), + *input, output, reduce_dims, DataBound::min(), CustomMax(), detail::IdentityFunctor(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index a81ca77a09bc0..296eff7125bd4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -44,7 +44,7 @@ class ReduceMinKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(FLT_MAX), CustomMin(), + *input, output, reduce_dims, DataBound::max(), CustomMin(), detail::IdentityFunctor(), stream); } }; From 790173af23957990286edf1bc8d198684af3ced3 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 1 Jun 2021 20:55:18 +0800 Subject: [PATCH 20/34] Update reduce_functor_op.h --- paddle/fluid/operators/reduce_ops/reduce_functor_op.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 718ae6715e502..f80a2842f4322 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ 
b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once -#include #include #include #include From 87008944b55daedc4af11127983fa6edb17b276c Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 2 Jun 2021 11:58:25 +0000 Subject: [PATCH 21/34] update TensorReduceFunc --- .../operators/reduce_ops/reduce_max_op.cu | 2 +- .../operators/reduce_ops/reduce_min_op.cu | 2 +- .../fluid/operators/reduce_ops/reduce_op.cuh | 93 ++++++++++++------- .../operators/reduce_ops/reduce_prod_op.cu | 19 ++-- 4 files changed, 70 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 86a374bc788dc..db65eeeb27394 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -45,7 +45,7 @@ class ReduceMaxKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, DataBound::min(), CustomMax(), - detail::IdentityFunctor(), stream); + detail::IdentityFunctor(), detail::IdentityFunctor(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 296eff7125bd4..b5f5a021b379b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -45,7 +45,7 @@ class ReduceMinKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, DataBound::max(), CustomMin(), - detail::IdentityFunctor(), stream); + detail::IdentityFunctor(), detail::IdentityFunctor(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 43666ccc8a940..8906264dedd45 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -38,11 +38,13 @@ namespace operators { namespace detail { // Post processing function for sum, max, min, prod, any -template +template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } }; // Post processing function for mean @@ -81,7 +83,7 @@ static inline std::vector GetDimStrides(const std::vector& dims, #ifdef __HIPCC__ constexpr int kMaxBlock = 256; #else -constexpr int kMaxBlock = 512; +constexpr int kMaxBlock = 128; #endif // get blockDim for reduceLastDim and reduceAny @@ -544,8 +546,7 @@ __global__ void ReduceKernelFunction( template -static void LaunchKernel(const Tx* x_data, Ty* y_data, - const platform::Place& place, const ReduceOp& reducer, +static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, gpuStream_t stream, ReduceConfig config) { #define CUB_REDUCE_TYPE_CASE(type) \ @@ -589,7 +590,6 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, - const platform::Place& place, const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, gpuStream_t stream, 
ReduceConfig config) { @@ -606,26 +606,9 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, case i: { \ constexpr auto kReduceRank = i; \ LaunchKernel( \ - x_data, y_data, place, reducer, transformer, init, stream, config); \ + x_data, y_data, reducer, transformer, init, stream, config); \ } break - // launch CUB::Reduce - if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { - cub::TransformInputIterator trans_x( - x_data, transformer); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); - framework::Tensor tmp; - auto* temp_storage = tmp.mutable_data( - framework::make_ddim({static_cast(temp_storage_bytes)}), - place); - cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); - - return; - } - detail::CheckReduceRank(reduce_rank, rank); switch (rank) { CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); @@ -649,10 +632,12 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, #undef CUB_RANK_CASE } -template +template void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, const Ty& init, const ReduceOp& reducer, const TransformOp& transformer, + const CubTransformOp& cub_transformer, gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); @@ -673,13 +658,28 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, y->Resize(out_dims); return; } + // launch CUB::Reduce + if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + cub::TransformInputIterator trans_x( + x_data, cub_transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + x.place()); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, init, stream); + + return; + } -#define CUB_BLOCK_DIM_CASE(block_dim) \ - case block_dim: { \ - constexpr auto kBlockDim = block_dim; \ - LaunchReduceKernel( \ - x_data, y_data, x.place(), reducer, transformer, init, stream, \ - config); \ +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + LaunchReduceKernel( \ + x_data, y_data, reducer, transformer, init, stream, config); \ } break switch (detail::GetBlockDim(config.reduce_num)) { @@ -696,5 +696,36 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } +template class TransformOp> +struct TensorReduceFunctorImpl { + const framework::Tensor& x; + framework::Tensor* y; + std::vector origin_reduce_dims; + const double& init; + const ReduceOp& reducer; + gpuStream_t stream; + TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, + const double& init, const ReduceOp& reducer, + gpuStream_t stream) + : x(x), + y(y), + origin_reduce_dims(origin_reduce_dims), + init(init), + reducer(reducer), + stream(stream) {} + + template + + void apply() const { + const Ty& init_cast = static_cast(init); + TensorReduceFunc, + TransformOp>(x, y, origin_reduce_dims, init_cast, + reducer, TransformOp(), + TransformOp(), stream); + } +}; + } // namespace operators } // namespace paddle diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
index 9f94a1bebf0ea..0d309e278a387 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
@@ -51,22 +51,15 @@ class ReduceProdKernel : public framework::OpKernel {
     auto stream = context.cuda_device_context().stream();
     if (out_dtype >= 0) {
-#define VisitDataTypeSmall_t(cpp_type, proto_type)            \
-  do {                                                        \
-    if (static_cast(out_dtype) ==                             \
-        proto_type) {                                         \
-      TensorReduceFunc,                                       \
-                       detail::IdentityFunctor>(              \
-          *input, output, reduce_dims, static_cast(1.0f),     \
-          CustomMul(), detail::IdentityFunctor(), stream);    \
-    }                                                         \
-  } while (0)
-      _ForEachDataTypeSmall_(VisitDataTypeSmall_t);
-#undef VisitDataTypeSmall_t
+      framework::VisitDataTypeSmall(
+          static_cast(out_dtype),
+          TensorReduceFunctorImpl(
+              *input, output, reduce_dims, static_cast(1.0f),
+              cub::Sum(), stream));
     } else {
       TensorReduceFunc, detail::IdentityFunctor>(
           *input, output, reduce_dims, static_cast(1.0f), CustomMul(),
-          detail::IdentityFunctor(), stream);
+          detail::IdentityFunctor(), detail::IdentityFunctor(), stream);
     }
   }
 };

From 9e32b0f3071c3bf743c3f9563aa909acaf64a0aa Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Thu, 3 Jun 2021 02:39:39 +0000
Subject: [PATCH 22/34] add reduce_functor_op.h pragma once

---
 paddle/fluid/operators/reduce_ops/reduce_functor_op.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
index f80a2842f4322..31f800c05f6b2 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#pragma once
+
 #include
 #include
 #include

From 17dcaf8e04ddffdf2df618b2c04543aed7757250 Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Mon, 7 Jun 2021 00:53:54 +0000
Subject: [PATCH 23/34] update BOUND and kMaxThread

---
 .../operators/reduce_ops/reduce_functor_op.h | 20 +++++++++----
 .../fluid/operators/reduce_ops/reduce_op.cuh | 30 +++++++++----------
 .../operators/reduce_ops/reduce_prod_op.cu   |  5 ++--
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
index 31f800c05f6b2..1fcfe2bd9672a 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h
@@ -54,36 +54,44 @@ struct DataBound {

 template
 struct CustomMin {
-  __device__ T operator()(const T &a, const T &b) const {
+  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
     return (b < a) ? b : a;
   }
 };

 template
 struct CustomMax {
-  __device__ T operator()(const T &a, const T &b) const {
+  __device__ __forceinline__ T operator()(const T &a, const T &b) const {
     return (b > a) ?
b : a; } }; template struct CustomSum { - __device__ T operator()(const T &a, const T &b) const { return b + a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b + a; + } }; template struct CustomMul { - __device__ T operator()(const T &a, const T &b) const { return b * a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b * a; + } }; template struct CustomLogicalOr { - __device__ T operator()(const T &a, const T &b) const { return b || a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b || a; + } }; template struct CustomLogicalAnd { - __device__ T operator()(const T &a, const T &b) const { return b && a; } + __device__ __forceinline__ T operator()(const T &a, const T &b) const { + return b && a; + } }; } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cuh index 8906264dedd45..c6b88a636b646 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cuh @@ -32,6 +32,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#define BOUND 512 namespace paddle { namespace operators { @@ -81,15 +82,16 @@ static inline std::vector GetDimStrides(const std::vector& dims, } #ifdef __HIPCC__ -constexpr int kMaxBlock = 256; +constexpr int kMaxThread = 256; #else -constexpr int kMaxBlock = 128; +constexpr int kMaxThread = 128; #endif // get blockDim for reduceLastDim and reduceAny static inline int GetBlockDim(int block_dim) { - return block_dim >= kMaxBlock ? kMaxBlock - : (1 << static_cast(std::log2(block_dim))); + return block_dim >= kMaxThread + ? 
kMaxThread + : (1 << static_cast(std::log2(block_dim))); } // check reduce rand is valid @@ -287,13 +289,15 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - + bool is_large_enough = reduce_num > BOUND / 2 && left_num > BOUND; if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { reduce_type = static_cast(ReduceType::kReduceLastDim); - } else if (reduce_rank == 1) { + + } else if (reduce_rank == 1 && + (rank == 2 && is_large_enough || rank != 2)) { // ReduceFirstDim and reduceSecondDim reduce_type = static_cast(ReduceType::kReduceHigherDim); @@ -333,7 +337,7 @@ struct ReduceConfig { // init int num_block = (max_threads / left_num); - if (num_block > 1 && reduce_num >= 512) { + if (num_block > 1 && reduce_num >= BOUND) { blocking_size = detail::GetLastPow2(reduce_num / num_block); if (blocking_size <= 1) { @@ -683,7 +687,6 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, } break switch (detail::GetBlockDim(config.reduce_num)) { - CUB_BLOCK_DIM_CASE(512); CUB_BLOCK_DIM_CASE(256); CUB_BLOCK_DIM_CASE(128); CUB_BLOCK_DIM_CASE(64); @@ -696,33 +699,30 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } -template class ReduceOp, template class TransformOp> struct TensorReduceFunctorImpl { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; const double& init; - const ReduceOp& reducer; gpuStream_t stream; TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, - const double& init, const ReduceOp& reducer, - gpuStream_t stream) + const double& init, gpuStream_t stream) : x(x), y(y), origin_reduce_dims(origin_reduce_dims), init(init), - reducer(reducer), stream(stream) {} template void apply() const { const Ty& init_cast = static_cast(init); - TensorReduceFunc, + TensorReduceFunc, TransformOp, TransformOp>(x, y, origin_reduce_dims, init_cast, - reducer, TransformOp(), + ReduceOp(), TransformOp(), TransformOp(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 0d309e278a387..6ae62ba2c49ef 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -53,9 +53,8 @@ class ReduceProdKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctorImpl( - *input, output, reduce_dims, static_cast(1.0f), - cub::Sum(), stream)); + TensorReduceFunctorImpl( + *input, output, reduce_dims, static_cast(1.0f), stream)); } else { TensorReduceFunc, detail::IdentityFunctor>( *input, output, reduce_dims, static_cast(1.0f), CustomMul(), From cb2b619e133c293366ace126703af00a36701af0 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 9 Jun 2021 03:16:17 +0000 Subject: [PATCH 24/34] modified max min prod for cu.h --- .../operators/reduce_ops/reduce_functor_op.h | 112 ++++++++----- .../operators/reduce_ops/reduce_max_op.cu | 21 +-- .../operators/reduce_ops/reduce_min_op.cu | 21 +-- .../{reduce_op.cuh => reduce_op.cu.h} | 154 ++++++++---------- .../operators/reduce_ops/reduce_prod_op.cu | 31 +--- 5 files changed, 158 insertions(+), 181 deletions(-) rename paddle/fluid/operators/reduce_ops/{reduce_op.cuh => reduce_op.cu.h} (85%) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h 
b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 1fcfe2bd9672a..944097ef0e1ab 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,83 +13,117 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - -#include -#include #include -#include +#include + +#ifdef __HIPCC__ +#include +#endif namespace paddle { namespace operators { -template -struct DataBound { - static inline T max() { return static_cast(FLT_MAX); } - static inline T min() { return static_cast(-FLT_MAX); } -}; +// Post processing function for sum, max, min, prod, any +template +struct IdentityFunctor { + __device__ explicit inline IdentityFunctor() {} -template <> -struct DataBound { - static inline float max() { return FLT_MAX; } - static inline float min() { return -FLT_MAX; } -}; + __device__ explicit inline IdentityFunctor(int n) {} -template <> -struct DataBound { - static inline double max() { return DBL_MAX; } - static inline double min() { return -DBL_MAX; } + __device__ inline Ty operator()(const Tx &x) const { + return static_cast(x); + } }; -template <> -struct DataBound { - static inline int32_t max() { return INT32_MAX; } - static inline int32_t min() { return INT32_MIN; } -}; +// Post processing function for mean +template +struct DivideFunctor { + __device__ explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + __device__ inline T operator()(const T &x) const { return x * n_inv; } -template <> -struct DataBound { - static inline int64_t max() { return INT64_MAX; } - static inline int64_t min() { return INT64_MIN; } + private: + T n_inv; }; -template +template struct CustomMin { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { + return std::numeric_limits::max(); + } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return (b < a) ? b : a; } }; -template +template struct CustomMax { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { + return std::numeric_limits::min(); + } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return (b > a) ? 
b : a; } }; -template +// for cub::Reduce +template struct CustomSum { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; } }; -template +template +struct CustomMean { + using Transformer = DivideFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b + a; + } +}; + +template struct CustomMul { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(1.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b * a; } }; -template +template struct CustomLogicalOr { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(false); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b || a; } }; -template +template struct CustomLogicalAnd { - __device__ __forceinline__ T operator()(const T &a, const T &b) const { + using Transformer = IdentityFunctor; + + __device__ __forceinline__ Ty initial() { return static_cast(true); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b && a; } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index db65eeeb27394..d8b4f13a7442c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" namespace paddle { @@ -26,26 +26,13 @@ class ReduceMaxKernel : public framework::OpKernel { bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); auto stream = context.cuda_device_context().stream(); - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, DataBound::min(), CustomMax(), - detail::IdentityFunctor(), detail::IdentityFunctor(), stream); + TensorReduceFunc(*input, output, reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index b5f5a021b379b..b81dba15b5da4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -13,7 +13,7 @@ // limitations under the License. 
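// (Illustrative sketch, not part of this patch. The template parameter lists
// in this transcript were lost in extraction, so the Tx/Ty spellings below
// are assumptions.) The contract these kernels rely on after PATCH 24 is
// that a reduce op bundles its own element transform and identity element:
//
//   template <typename Tx, typename Ty = Tx>
//   struct SomeReduceOp {
//     using Transformer = IdentityFunctor<Tx, Ty>;  // per-element pre-process
//     __device__ __forceinline__ Ty initial();      // identity element
//     __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const;
//   };
//
// One caveat worth flagging: for floating-point types,
// std::numeric_limits<T>::min() (used by CustomMax::initial() above) is the
// smallest positive value, while std::numeric_limits<T>::lowest() is the most
// negative one and is normally the identity a max-reduction wants.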
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" namespace paddle { @@ -26,26 +26,13 @@ class ReduceMinKernel : public framework::OpKernel { bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); auto stream = context.cuda_device_context().stream(); - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, DataBound::max(), CustomMin(), - detail::IdentityFunctor(), detail::IdentityFunctor(), stream); + TensorReduceFunc(*input, output, reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cuh b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h similarity index 85% rename from paddle/fluid/operators/reduce_ops/reduce_op.cuh rename to paddle/fluid/operators/reduce_ops/reduce_op.cu.h index c6b88a636b646..4e6b0d19ef8a4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cuh +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -32,32 +32,29 @@ namespace cub = hipcub; #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#define BOUND 512 +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" + +#define BOUNDARY 512 // Reduce split or not, Whether to use ReduceHigherDim namespace paddle { namespace operators { namespace detail { -// Post processing function for sum, max, min, prod, any -template -struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor() {} - - HOSTDEVICE inline Ty operator()(const Tx& x) const { - return static_cast(x); +static inline std::vector GetReduceDim(const std::vector& dims, + int dim_size, bool reduce_all) { + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(dim_size); + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = i; + } + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? 
e : e + dim_size); + } } -}; - -// Post processing function for mean -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; + return reduce_dims; +} static inline int GetLastPow2(int n) { n |= (n >> 1); @@ -289,7 +286,7 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_large_enough = reduce_num > BOUND / 2 && left_num > BOUND; + bool is_large_enough = (reduce_num > BOUNDARY / 2) || (left_num > BOUNDARY); if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); @@ -337,7 +334,7 @@ struct ReduceConfig { // init int num_block = (max_threads / left_num); - if (num_block > 1 && reduce_num >= BOUND) { + if (num_block > 1 && reduce_num >= BOUNDARY) { blocking_size = detail::GetLastPow2(reduce_num / num_block); if (blocking_size <= 1) { @@ -394,14 +391,15 @@ template __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, int reduce_num) { __shared__ typename cub::BlockReduce::TempStorage temp_storage; int idx_x = blockIdx.x * reduce_num; int idx_y = threadIdx.x; - Ty reduce_var = init; + Ty reduce_var = reducer.initial(); for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) { - reduce_var = reducer(reduce_var, static_cast(x[idx_x + idx_y])); + reduce_var = + reducer(reduce_var, static_cast(transformer(x[idx_x + idx_y]))); } __syncthreads(); @@ -409,7 +407,7 @@ __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); if (threadIdx.x == 0) { - y[blockIdx.x] = transformer(reduce_var); + y[blockIdx.x] = reduce_var; } } @@ -422,13 +420,12 @@ template __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, - Ty init, int reduce_num, - int left_num, int block_size) { + int reduce_num, int left_num, + int block_size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int idy = blockIdx.y * block_size; - Ty temp = init; - Ty reduce_var = init; + Ty reduce_var = reducer.initial(); if (idx < left_num) { int loop = reduce_num - idy; @@ -436,11 +433,11 @@ __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, for (int iy = 0; iy < loop; iy++) { int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; - reduce_var = reducer(reduce_var, static_cast(x[id])); + reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); } y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = - static_cast(transformer(reduce_var)); + reduce_var; } } @@ -450,7 +447,7 @@ __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, template __device__ __forceinline__ void ReduceAny( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, int reduce_num, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, paddle::framework::Array reduce_strides, @@ -490,8 +487,8 @@ __device__ __forceinline__ void ReduceAny( idx_x += (sub_index[k] * x_strides[k]); } - reduce_var = - static_cast(reducer(reduce_var, static_cast(x[idx_x]))); + reduce_var = static_cast( + reducer(reduce_var, static_cast(transformer(x[idx_x])))); } __syncthreads(); @@ -499,7 +496,7 @@ __device__ __forceinline__ void ReduceAny( 
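// At this point every thread in the block holds a partial result in
// reduce_var; the CUB block-wide reduction below combines those BlockDim
// partials, and only thread 0 writes the final value for this block.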
cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); if (threadIdx.x == 0) { - y[blockIdx.x] = transformer(reduce_var); + y[blockIdx.x] = reduce_var; } } @@ -507,7 +504,7 @@ __device__ __forceinline__ void ReduceAny( template __device__ __forceinline__ void ReduceModule( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, int reduce_num, int left_num, int blocking_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -517,17 +514,17 @@ __device__ __forceinline__ void ReduceModule( // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1 if (ReduceType == ReduceType::kReduceLastDim) { ReduceLastDim( - x, y, reducer, transformer, init, reduce_num); + x, y, reducer, transformer, reduce_num); // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (ReduceType == ReduceType::kReduceHigherDim) { ReduceHigherDim( - x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); + x, y, reducer, transformer, reduce_num, left_num, blocking_size); // reduce_rank >= 2 } else { ReduceAny( - x, y, reducer, transformer, init, reduce_num, x_strides, reduce_dim, + x, y, reducer, transformer, reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, left_strides); } } @@ -535,7 +532,7 @@ __device__ __forceinline__ void ReduceModule( template __global__ void ReduceKernelFunction( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, int reduce_num, int left_num, int block_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -543,24 +540,24 @@ __global__ void ReduceKernelFunction( paddle::framework::Array left_dim, paddle::framework::Array left_strides) { ReduceModule(x, y, reducer, transformer, init, reduce_num, - left_num, block_size, x_strides, reduce_dim, - reduce_strides, left_dim, left_strides); + ReduceType>(x, y, reducer, transformer, reduce_num, left_num, + block_size, x_strides, reduce_dim, reduce_strides, + left_dim, left_strides); } template static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, - const TransformOp& transformer, const Ty& init, - gpuStream_t stream, ReduceConfig config) { + const TransformOp& transformer, gpuStream_t stream, + ReduceConfig config) { #define CUB_REDUCE_TYPE_CASE(type) \ case type: { \ constexpr auto kReduceType = type; \ ReduceKernelFunction< \ Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ kReduceType><<>>( \ - x_data, config.output_data, reducer, transformer, init, \ - config.reduce_num, config.left_num, config.blocking_size, \ + x_data, config.output_data, reducer, transformer, config.reduce_num, \ + config.left_num, config.blocking_size, \ detail::VectorToArray(config.x_strides), \ detail::VectorToArray(config.reduce_dim), \ detail::VectorToArray(config.reduce_strides), \ @@ -579,10 +576,10 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, dim3 grid(config.grid.x, 1, config.grid.z); ReduceKernelFunction< - Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, + Ty, Ty, ReduceOp, IdentityFunctor, 128, kRank, kReduceRank, ReduceType::kReduceHigherDim><<>>( - config.output_data, y_data, reducer, detail::IdentityFunctor(), - init, config.grid.y, config.left_num, config.grid.y, + config.output_data, y_data, reducer, IdentityFunctor(), + config.grid.y, config.left_num, config.grid.y, detail::VectorToArray(config.x_strides), 
detail::VectorToArray(config.reduce_dim), detail::VectorToArray(config.reduce_strides), @@ -595,7 +592,7 @@ template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, - const TransformOp& transformer, const Ty& init, + const TransformOp& transformer, gpuStream_t stream, ReduceConfig config) { int reduce_rank = config.reduce_strides.size(); int rank = config.x_strides.size(); @@ -610,7 +607,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, case i: { \ constexpr auto kReduceRank = i; \ LaunchKernel( \ - x_data, y_data, reducer, transformer, init, stream, config); \ + x_data, y_data, reducer, transformer, stream, config); \ } break detail::CheckReduceRank(reduce_rank, rank); @@ -636,13 +633,10 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, #undef CUB_RANK_CASE } -template +template class ReduceOp> void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, const Ty& init, - const ReduceOp& reducer, const TransformOp& transformer, - const CubTransformOp& cub_transformer, - gpuStream_t stream) { + std::vector origin_reduce_dims, gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(); // get the parameters of LaunchReduceKernel @@ -662,28 +656,33 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, y->Resize(out_dims); return; } + using TransformOp = typename ReduceOp::Transformer; + auto reducer = ReduceOp(); // launch CUB::Reduce if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { - cub::TransformInputIterator trans_x( - x_data, cub_transformer); + cub::TransformInputIterator trans_x( + x_data, TransformOp(config.reduce_num)); size_t temp_storage_bytes = 0; cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); + config.reduce_num, reducer, reducer.initial(), + stream); framework::Tensor tmp; auto* temp_storage = tmp.mutable_data( framework::make_ddim({static_cast(temp_storage_bytes)}), x.place()); cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - config.reduce_num, reducer, init, stream); + config.reduce_num, reducer, reducer.initial(), + stream); return; } -#define CUB_BLOCK_DIM_CASE(block_dim) \ - case block_dim: { \ - constexpr auto kBlockDim = block_dim; \ - LaunchReduceKernel( \ - x_data, y_data, reducer, transformer, init, stream, config); \ +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + LaunchReduceKernel, TransformOp>( \ + x_data, y_data, reducer, TransformOp(config.reduce_num), stream, \ + config); \ } break switch (detail::GetBlockDim(config.reduce_num)) { @@ -699,31 +698,20 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } -template class ReduceOp, - template class TransformOp> +template class ReduceOp> struct TensorReduceFunctorImpl { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; - const double& init; gpuStream_t stream; TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, - const double& init, gpuStream_t stream) - : x(x), - y(y), - origin_reduce_dims(origin_reduce_dims), - init(init), - stream(stream) {} + gpuStream_t stream) + : x(x), y(y), origin_reduce_dims(origin_reduce_dims), stream(stream) {} template - void apply() const { - const Ty& init_cast = 
static_cast(init); - TensorReduceFunc, TransformOp, - TransformOp>(x, y, origin_reduce_dims, init_cast, - ReduceOp(), TransformOp(), - TransformOp(), stream); + TensorReduceFunc(x, y, origin_reduce_dims, stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 6ae62ba2c49ef..1e996dcaea5be 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -13,8 +13,7 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cuh" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" namespace paddle { @@ -28,37 +27,19 @@ class ReduceProdKernel : public framework::OpKernel { auto* input = context.Input("X"); auto* output = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = i; - } - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); auto stream = context.cuda_device_context().stream(); if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctorImpl( - *input, output, reduce_dims, static_cast(1.0f), stream)); + TensorReduceFunctorImpl(*input, output, reduce_dims, + stream)); } else { - TensorReduceFunc, detail::IdentityFunctor>( - *input, output, reduce_dims, static_cast(1.0f), CustomMul(), - detail::IdentityFunctor(), detail::IdentityFunctor(), stream); + TensorReduceFunc(*input, output, reduce_dims, stream); } } }; From 6541ffbf330635cdae0b9cf0dbd0278f3984bd55 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 9 Jun 2021 16:33:42 +0000 Subject: [PATCH 25/34] update for struct --- .../operators/reduce_ops/reduce_functor_op.h | 38 ++++--------------- .../fluid/operators/reduce_ops/reduce_op.cu.h | 28 ++++++++++++-- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 944097ef0e1ab..8fb221e86f2bc 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #ifdef __HIPCC__ #include #endif @@ -23,32 +24,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -// Post processing function for sum, max, min, prod, any -template -struct IdentityFunctor { - __device__ explicit inline IdentityFunctor() {} - - __device__ explicit inline IdentityFunctor(int n) {} - - __device__ inline Ty operator()(const Tx &x) const { - return static_cast(x); - } -}; - -// Post processing function for mean -template -struct DivideFunctor { - __device__ explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - __device__ inline T operator()(const T &x) const { return x * n_inv; } - - private: - T n_inv; -}; - template struct CustomMin { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return std::numeric_limits::max(); @@ -61,7 +39,7 @@ struct CustomMin { template struct CustomMax { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return std::numeric_limits::min(); @@ -75,7 +53,7 @@ struct CustomMax { // for cub::Reduce template struct CustomSum { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } @@ -86,7 +64,7 @@ struct CustomSum { template struct CustomMean { - using Transformer = DivideFunctor; + using Transformer = detail::DivideFunctor; __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } @@ -97,7 +75,7 @@ struct CustomMean { template struct CustomMul { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(1.0f); } @@ -108,7 +86,7 @@ struct CustomMul { template struct CustomLogicalOr { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(false); } @@ -119,7 +97,7 @@ struct CustomLogicalOr { template struct CustomLogicalAnd { - using Transformer = IdentityFunctor; + using Transformer = detail::IdentityFunctor; __device__ __forceinline__ Ty initial() { return static_cast(true); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 4e6b0d19ef8a4..31f7820dc12d2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -32,7 +32,6 @@ namespace cub = hipcub; #include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #define BOUNDARY 512 // Reduce split or not, Whether to use ReduceHigherDim @@ -40,6 +39,29 @@ namespace paddle { namespace operators { namespace detail { +// Post processing function for sum, max, min, prod, any +template +struct IdentityFunctor { + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx &x) const { + return static_cast(x); + } +}; + +// Post processing function for mean +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T &x) const { return x * n_inv; } + + private: + T n_inv; +}; + + static inline std::vector GetReduceDim(const std::vector& dims, int dim_size, bool reduce_all) { std::vector reduce_dims; @@ -576,9 +598,9 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, dim3 
grid(config.grid.x, 1, config.grid.z); ReduceKernelFunction< - Ty, Ty, ReduceOp, IdentityFunctor, 128, kRank, kReduceRank, + Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, ReduceType::kReduceHigherDim><<>>( - config.output_data, y_data, reducer, IdentityFunctor(), + config.output_data, y_data, reducer, detail::IdentityFunctor(config.grid.y), config.grid.y, config.left_num, config.grid.y, detail::VectorToArray(config.x_strides), detail::VectorToArray(config.reduce_dim), From 719e435fc22372ca04bec30c5ba791638d232b39 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 9 Jun 2021 16:39:31 +0000 Subject: [PATCH 26/34] code style reduce_op.cu.h --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 31f7820dc12d2..26925ac2c4e80 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -42,10 +42,9 @@ namespace detail { // Post processing function for sum, max, min, prod, any template struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor(int n) {} - HOSTDEVICE inline Ty operator()(const Tx &x) const { + HOSTDEVICE inline Ty operator()(const Tx& x) const { return static_cast(x); } }; @@ -55,13 +54,12 @@ template struct DivideFunctor { HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - HOSTDEVICE inline T operator()(const T &x) const { return x * n_inv; } + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } private: T n_inv; }; - static inline std::vector GetReduceDim(const std::vector& dims, int dim_size, bool reduce_all) { std::vector reduce_dims; @@ -600,8 +598,9 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, ReduceKernelFunction< Ty, Ty, ReduceOp, detail::IdentityFunctor, 128, kRank, kReduceRank, ReduceType::kReduceHigherDim><<>>( - config.output_data, y_data, reducer, detail::IdentityFunctor(config.grid.y), - config.grid.y, config.left_num, config.grid.y, + config.output_data, y_data, reducer, + detail::IdentityFunctor(config.grid.y), config.grid.y, + config.left_num, config.grid.y, detail::VectorToArray(config.x_strides), detail::VectorToArray(config.reduce_dim), detail::VectorToArray(config.reduce_strides), From 5045a4919ad3fd1d4f3b2e1e84287297f2e18e1b Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Thu, 10 Jun 2021 06:18:17 +0000 Subject: [PATCH 27/34] device to HOSTDEVICE --- .../fluid/operators/reduce_ops/reduce_functor_op.h | 14 +++++++------- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 944097ef0e1ab..0f06dc93db43a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -50,7 +50,7 @@ template struct CustomMin { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { + HOSTDEVICE __forceinline__ Ty initial() { return std::numeric_limits::max(); } @@ -63,7 +63,7 @@ template struct CustomMax { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { + HOSTDEVICE __forceinline__ Ty initial() { return std::numeric_limits::min(); } @@ -77,7 +77,7 @@ template struct CustomSum { using Transformer = IdentityFunctor; - __device__ 
__forceinline__ Ty initial() { return static_cast(0.0f); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -88,7 +88,7 @@ template struct CustomMean { using Transformer = DivideFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(0.0f); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -99,7 +99,7 @@ template struct CustomMul { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(1.0f); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(1.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b * a; @@ -110,7 +110,7 @@ template struct CustomLogicalOr { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(false); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(false); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b || a; @@ -121,7 +121,7 @@ template struct CustomLogicalAnd { using Transformer = IdentityFunctor; - __device__ __forceinline__ Ty initial() { return static_cast(true); } + HOSTDEVICE __forceinline__ Ty initial() { return static_cast(true); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b && a; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 4e6b0d19ef8a4..9e0a4e00d9d7f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -472,7 +472,7 @@ __device__ __forceinline__ void ReduceAny( for (int k = 0; k < Rank; ++k) { idx_x += (sub_index[k] * x_strides[k]); } - Ty reduce_var = static_cast(x[idx_x]); + Ty reduce_var = static_cast(transformer(x[idx_x])); for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { int reduce_idx = i; From fb69e3d41b8e89d50c9366ccd0fa99457afc6641 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 15 Jun 2021 07:01:13 +0000 Subject: [PATCH 28/34] ReduceCudaKernel --- .../operators/reduce_ops/reduce_functor_op.h | 2 +- .../operators/reduce_ops/reduce_max_op.cu | 33 +++---------- .../operators/reduce_ops/reduce_min_op.cu | 33 +++---------- .../fluid/operators/reduce_ops/reduce_op.cu.h | 28 +++++++++++ .../operators/reduce_ops/reduce_prod_op.cu | 47 ++++--------------- 5 files changed, 51 insertions(+), 92 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 9b10286239bb7..9cd3e1197af24 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -42,7 +42,7 @@ struct CustomMax { using Transformer = detail::IdentityFunctor; HOSTDEVICE __forceinline__ Ty initial() { - return std::numeric_limits::min(); + return std::numeric_limits::lowest(); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index d8b4f13a7442c..f214fcba199a3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -11,34 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class ReduceMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - - std::vector reduce_dims = - detail::GetReduceDim(dims, input->dims().size(), reduce_all); - - auto stream = context.cuda_device_context().stream(); - TensorReduceFunc(*input, output, reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_max, ops::ReduceMaxKernel, - ops::ReduceMaxKernel, ops::ReduceMaxKernel, - ops::ReduceMaxKernel); +// reduce_max +REGISTER_OP_CUDA_KERNEL( + reduce_max, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index b81dba15b5da4..7806df284d8c0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -11,34 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -class ReduceMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto dims = context.Attr>("dim"); - - std::vector reduce_dims = - detail::GetReduceDim(dims, input->dims().size(), reduce_all); - - auto stream = context.cuda_device_context().stream(); - TensorReduceFunc(*input, output, reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_min, ops::ReduceMinKernel, - ops::ReduceMinKernel, ops::ReduceMinKernel, - ops::ReduceMinKernel); +// reduce_min +REGISTER_OP_CUDA_KERNEL( + reduce_min, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 63b209eeefa6b..476f4b8fb7b77 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -30,6 +30,7 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/array.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" @@ -150,6 +151,8 @@ static inline paddle::framework::Array VectorToArray( } // namespace detail +using Tensor = framework::Tensor; + enum ReduceType { kReduceAll = 0x00, // when reduce_rank == x_rank kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; @@ -736,5 +739,30 @@ struct 
TensorReduceFunctorImpl { } }; +template class ReduceOp> +class ReduceCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + const Tensor* input = context.Input("X"); + Tensor* output = context.Output("Out"); + auto out_dtype = context.Attr("out_dtype"); + std::vector dims = context.Attr>("dim"); + + std::vector reduce_dims = + detail::GetReduceDim(dims, input->dims().size(), reduce_all); + + gpuStream_t stream = context.cuda_device_context().stream(); + if (out_dtype >= 0) { + framework::VisitDataTypeSmall( + static_cast(out_dtype), + TensorReduceFunctorImpl(*input, output, reduce_dims, + stream)); + } else { + TensorReduceFunc(*input, output, reduce_dims, stream); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 1e996dcaea5be..9232a32ee55f3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -16,46 +16,19 @@ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" -namespace paddle { -namespace operators { - -template -class ReduceProdKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto out_dtype = context.Attr("out_dtype"); - auto dims = context.Attr>("dim"); - - std::vector reduce_dims = - detail::GetReduceDim(dims, input->dims().size(), reduce_all); - - auto stream = context.cuda_device_context().stream(); - if (out_dtype >= 0) { - framework::VisitDataTypeSmall( - static_cast(out_dtype), - TensorReduceFunctorImpl(*input, output, reduce_dims, - stream)); - } else { - TensorReduceFunc(*input, output, reduce_dims, stream); - } - } -}; - -} // namespace operators -} // namespace paddle +// reduce_prod #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, - ops::ReduceProdKernel, - ops::ReduceProdKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); #else -REGISTER_OP_CUDA_KERNEL(reduce_prod, ops::ReduceProdKernel, - ops::ReduceProdKernel, - ops::ReduceProdKernel, - ops::ReduceProdKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_prod, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); #endif From b841b340e0b86b380fd1636a38e54a486c1ad410 Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Tue, 15 Jun 2021 07:12:43 +0000 Subject: [PATCH 29/34] REDUCE_SPLIT_BOUNDARY --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 476f4b8fb7b77..5e9b4bff34396 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -34,7 +34,8 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#define BOUNDARY 512 // Reduce split or not, Whether to use 
ReduceHigherDim +#define REDUCE_SPLIT_BOUNDARY \ + 512 // Reduce split or not, Whether to use ReduceHigherDim namespace paddle { namespace operators { @@ -309,7 +310,8 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_large_enough = (reduce_num > BOUNDARY / 2) || (left_num > BOUNDARY); + bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || + (left_num > REDUCE_SPLIT_BOUNDARY); if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); @@ -357,7 +359,7 @@ struct ReduceConfig { // init int num_block = (max_threads / left_num); - if (num_block > 1 && reduce_num >= BOUNDARY) { + if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { blocking_size = detail::GetLastPow2(reduce_num / num_block); if (blocking_size <= 1) { From 1fda4d5364452143072808f4f004752e33e7ac01 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 15 Jun 2021 17:41:01 +0800 Subject: [PATCH 30/34] Update reduce_op.cu.h --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 5e9b4bff34396..98ee5969c509d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -34,8 +34,8 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#define REDUCE_SPLIT_BOUNDARY \ - 512 // Reduce split or not, Whether to use ReduceHigherDim +// Reduce split or not, Whether to use ReduceHigherDim +#define REDUCE_SPLIT_BOUNDARY 512 namespace paddle { namespace operators { From c85ca05c3798964e956128c41ac155911e6568ce Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 16 Jun 2021 06:27:08 +0000 Subject: [PATCH 31/34] rename reduceTensorFunctor --- .../operators/reduce_ops/reduce_functor_op.h | 8 ++-- .../fluid/operators/reduce_ops/reduce_op.cu.h | 42 ++++++++++--------- .../operators/reduce_ops/reduce_prod_op.cu | 1 - 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 9cd3e1197af24..09ecbbc9e0f59 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include +#include #include - #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/platform/hostdevice.h" #ifdef __HIPCC__ #include #endif @@ -29,7 +29,7 @@ struct CustomMin { using Transformer = detail::IdentityFunctor; HOSTDEVICE __forceinline__ Ty initial() { - return std::numeric_limits::max(); + return static_cast(std::numeric_limits::max()); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { @@ -42,7 +42,7 @@ struct CustomMax { using Transformer = detail::IdentityFunctor; HOSTDEVICE __forceinline__ Ty initial() { - return std::numeric_limits::lowest(); + return static_cast(std::numeric_limits::lowest()); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 98ee5969c509d..000933a78f035 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -35,7 +35,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor_util.h" // Reduce split or not, Whether to use ReduceHigherDim -#define REDUCE_SPLIT_BOUNDARY 512 +#define REDUCE_SPLIT_BOUNDARY 512 namespace paddle { namespace operators { @@ -72,6 +72,11 @@ static inline std::vector GetReduceDim(const std::vector& dims, } } else { for (auto e : dims) { + PADDLE_ENFORCE_LT(e, dim_size, + paddle::platform::errors::InvalidArgument( + "ReduceOp: invalid axis, when x_dims is %d, " + "axis[i] should less than x_dims, but got %d.", + dim_size, e)); reduce_dims.push_back(e >= 0 ? e : e + dim_size); } } @@ -108,9 +113,7 @@ constexpr int kMaxThread = 128; // get blockDim for reduceLastDim and reduceAny static inline int GetBlockDim(int block_dim) { - return block_dim >= kMaxThread - ? kMaxThread - : (1 << static_cast(std::log2(block_dim))); + return block_dim >= kMaxThread ? kMaxThread : GetLastPow2(block_dim); } // check reduce rand is valid @@ -220,11 +223,11 @@ struct ReduceConfig { if (reduce_dim_temp.size() > 1) { for (int i = 1; i < x_dim.size(); i++) { - if (idx_reduce < reduce_dim_temp.size() && - i == reduce_dim_temp[idx_reduce]) { + if ((idx_reduce < reduce_dim_temp.size()) && + (i == reduce_dim_temp[idx_reduce])) { int result = reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; - bool is_equal = (result - num == 1); + bool is_equal = ((result - num) == 1); if (is_equal) { x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; num++; @@ -248,7 +251,6 @@ struct ReduceConfig { std::vector reduce_dim_new; int is_reduced = 0; for (auto e : reduce_dim) { - auto pos = e >= 0 ? 
e : e + x_dim.size(); is_reduced |= 1 << e; } @@ -312,6 +314,7 @@ struct ReduceConfig { int reduce_rank = reduce_dim.size(); bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || (left_num > REDUCE_SPLIT_BOUNDARY); + if (rank == reduce_rank) { reduce_type = static_cast(ReduceType::kReduceAll); @@ -319,7 +322,7 @@ struct ReduceConfig { reduce_type = static_cast(ReduceType::kReduceLastDim); } else if (reduce_rank == 1 && - (rank == 2 && is_large_enough || rank != 2)) { + ((rank == 2 && is_large_enough) || rank != 2)) { // ReduceFirstDim and reduceSecondDim reduce_type = static_cast(ReduceType::kReduceHigherDim); @@ -661,8 +664,9 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, template class ReduceOp> -void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, gpuStream_t stream) { +void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, + gpuStream_t stream) { auto x_dim = framework::vectorize(x.dims()); auto config = ReduceConfig(origin_reduce_dims, x_dim); config.Run(); // get the parameters of LaunchReduceKernel @@ -725,19 +729,18 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, } template class ReduceOp> -struct TensorReduceFunctorImpl { +struct TensorReduceFunctor { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; gpuStream_t stream; - TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, - gpuStream_t stream) + TensorReduceFunctor(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, gpuStream_t stream) : x(x), y(y), origin_reduce_dims(origin_reduce_dims), stream(stream) {} template void apply() const { - TensorReduceFunc(x, y, origin_reduce_dims, stream); + TensorReduceFunctorImpl(x, y, origin_reduce_dims, stream); } }; @@ -758,10 +761,11 @@ class ReduceCudaKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctorImpl(*input, output, reduce_dims, - stream)); + TensorReduceFunctor(*input, output, reduce_dims, + stream)); } else { - TensorReduceFunc(*input, output, reduce_dims, stream); + TensorReduceFunctorImpl(*input, output, reduce_dims, + stream); } } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 9232a32ee55f3..4f259e415d222 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -17,7 +17,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" // reduce_prod - #ifdef __HIPCC__ // Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 // do not support double in HIPCC platform (Eigen3 to be fixed) From 9cc8ac3fc70848b698709f5fc6b56c7a071f766e Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Wed, 16 Jun 2021 11:42:39 +0000 Subject: [PATCH 32/34] rename TensorReduceFunc --- paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 000933a78f035..bdbfbc01f6743 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -729,13 +729,13 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, } template class 
ReduceOp> -struct TensorReduceFunctor { +struct TensorReduceFunc { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; gpuStream_t stream; - TensorReduceFunctor(const framework::Tensor& x, framework::Tensor* y, - std::vector origin_reduce_dims, gpuStream_t stream) + TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, gpuStream_t stream) : x(x), y(y), origin_reduce_dims(origin_reduce_dims), stream(stream) {} template @@ -761,8 +761,7 @@ class ReduceCudaKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctor(*input, output, reduce_dims, - stream)); + TensorReduceFunc(*input, output, reduce_dims, stream)); } else { TensorReduceFunctorImpl(*input, output, reduce_dims, stream); From 140779d56b1afeb669fc30171a106f1e3363a89b Mon Sep 17 00:00:00 2001 From: niuliling123 Date: Thu, 17 Jun 2021 05:14:05 +0000 Subject: [PATCH 33/34] delete HOSTDEVICE --- .../operators/reduce_ops/reduce_functor_op.h | 14 +++---- .../fluid/operators/reduce_ops/reduce_op.cu.h | 42 +++++++++---------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h index 09ecbbc9e0f59..0f02be21cc907 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -28,7 +28,7 @@ template struct CustomMin { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { + inline Ty initial() { return static_cast(std::numeric_limits::max()); } @@ -41,7 +41,7 @@ template struct CustomMax { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { + inline Ty initial() { return static_cast(std::numeric_limits::lowest()); } @@ -55,7 +55,7 @@ template struct CustomSum { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } + inline Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -66,7 +66,7 @@ template struct CustomMean { using Transformer = detail::DivideFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(0.0f); } + inline Ty initial() { return static_cast(0.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b + a; @@ -77,7 +77,7 @@ template struct CustomMul { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(1.0f); } + inline Ty initial() { return static_cast(1.0f); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b * a; @@ -88,7 +88,7 @@ template struct CustomLogicalOr { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(false); } + inline Ty initial() { return static_cast(false); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b || a; @@ -99,7 +99,7 @@ template struct CustomLogicalAnd { using Transformer = detail::IdentityFunctor; - HOSTDEVICE __forceinline__ Ty initial() { return static_cast(true); } + inline Ty initial() { return static_cast(true); } __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { return b && a; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h 
index bdbfbc01f6743..4da57015bb105 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -419,12 +419,12 @@ template __device__ __forceinline__ void ReduceLastDim(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, + TransformOp transformer, Ty init, int reduce_num) { __shared__ typename cub::BlockReduce::TempStorage temp_storage; int idx_x = blockIdx.x * reduce_num; int idx_y = threadIdx.x; - Ty reduce_var = reducer.initial(); + Ty reduce_var = init; for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) { reduce_var = reducer(reduce_var, static_cast(transformer(x[idx_x + idx_y]))); @@ -448,12 +448,12 @@ template __device__ __forceinline__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, - int reduce_num, int left_num, - int block_size) { + Ty init, int reduce_num, + int left_num, int block_size) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int idy = blockIdx.y * block_size; - Ty reduce_var = reducer.initial(); + Ty reduce_var = init; if (idx < left_num) { int loop = reduce_num - idy; @@ -532,7 +532,7 @@ __device__ __forceinline__ void ReduceAny( template __device__ __forceinline__ void ReduceModule( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, int reduce_num, int left_num, int blocking_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -542,12 +542,12 @@ __device__ __forceinline__ void ReduceModule( // reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1 if (ReduceType == ReduceType::kReduceLastDim) { ReduceLastDim( - x, y, reducer, transformer, reduce_num); + x, y, reducer, transformer, init, reduce_num); // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (ReduceType == ReduceType::kReduceHigherDim) { ReduceHigherDim( - x, y, reducer, transformer, reduce_num, left_num, blocking_size); + x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); // reduce_rank >= 2 } else { @@ -560,7 +560,7 @@ __device__ __forceinline__ void ReduceModule( template __global__ void ReduceKernelFunction( - const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, + const Tx* x, Ty* y, ReduceOp reducer, TransformOp transformer, Ty init, int reduce_num, int left_num, int block_size, paddle::framework::Array x_strides, paddle::framework::Array reduce_dim, @@ -568,24 +568,24 @@ __global__ void ReduceKernelFunction( paddle::framework::Array left_dim, paddle::framework::Array left_strides) { ReduceModule(x, y, reducer, transformer, reduce_num, left_num, - block_size, x_strides, reduce_dim, reduce_strides, - left_dim, left_strides); + ReduceType>(x, y, reducer, transformer, init, reduce_num, + left_num, block_size, x_strides, reduce_dim, + reduce_strides, left_dim, left_strides); } template static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer, - const TransformOp& transformer, gpuStream_t stream, - ReduceConfig config) { + const TransformOp& transformer, Ty init, + gpuStream_t stream, ReduceConfig config) { #define CUB_REDUCE_TYPE_CASE(type) \ case type: { \ constexpr auto kReduceType = type; \ ReduceKernelFunction< \ Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \ kReduceType><<>>( \ - x_data, config.output_data, reducer, transformer, config.reduce_num, \ - config.left_num, config.blocking_size, \ + x_data, config.output_data, reducer, transformer, init, \ + config.reduce_num, 
config.left_num, config.blocking_size,                                \
        detail::VectorToArray<int, kRank>(config.x_strides),                  \
        detail::VectorToArray<int, kReduceRank>(config.reduce_dim),           \
        detail::VectorToArray<int, kReduceRank>(config.reduce_strides),       \
@@ -607,7 +607,7 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
         Ty, Ty, ReduceOp, detail::IdentityFunctor<Ty>, 128, kRank, kReduceRank,
         ReduceType::kReduceHigherDim><<<grid, block, 0, stream>>>(
         config.output_data, y_data, reducer,
-        detail::IdentityFunctor<Ty>(config.grid.y), config.grid.y,
+        detail::IdentityFunctor<Ty>(config.grid.y), init, config.grid.y,
         config.left_num, config.grid.y,
         detail::VectorToArray<int, kRank>(config.x_strides),
         detail::VectorToArray<int, kReduceRank>(config.reduce_dim),
@@ -621,7 +621,7 @@ template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
 static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
                                const ReduceOp& reducer,
-                               const TransformOp& transformer,
+                               const TransformOp& transformer, Ty init,
                                gpuStream_t stream, ReduceConfig<Ty> config) {
   int reduce_rank = config.reduce_strides.size();
   int rank = config.x_strides.size();
@@ -636,7 +636,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
   case i: {                                                                  \
     constexpr auto kReduceRank = i;                                          \
     LaunchKernel<Tx, Ty, BlockDim, ReduceOp, TransformOp, kRank, kReduceRank>( \
-        x_data, y_data, reducer, transformer, stream, config);               \
+        x_data, y_data, reducer, transformer, init, stream, config);         \
   } break

   detail::CheckReduceRank(reduce_rank, rank);
@@ -711,8 +711,8 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   case block_dim: {                                                          \
     constexpr auto kBlockDim = block_dim;                                    \
     LaunchReduceKernel<Tx, Ty, kBlockDim, ReduceOp<Tx, Ty>, TransformOp>(    \
-        x_data, y_data, reducer, TransformOp(config.reduce_num), stream,     \
-        config);                                                             \
+        x_data, y_data, reducer, TransformOp(config.reduce_num),             \
+        reducer.initial(), stream, config);                                  \
   } break

   switch (detail::GetBlockDim(config.reduce_num)) {

From fa3411c800f3f67d34395587f11016541715897f Mon Sep 17 00:00:00 2001
From: niuliling123
Date: Thu, 17 Jun 2021 15:20:57 +0000
Subject: [PATCH 34/34] add left_num * grid.z * grid.y

---
 paddle/fluid/operators/reduce_ops/reduce_op.cu.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
index 4da57015bb105..5fad6efdb3496 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h
@@ -191,7 +191,7 @@ struct ReduceConfig {
     if (should_reduce_again) {
       output_data = tmp->mutable_data<Ty>(
           framework::make_ddim(
-              {static_cast<int64_t>(left_num * grid.y * sizeof(Ty))}),
+              {static_cast<int64_t>(left_num * grid.z * grid.y * sizeof(Ty))}),
           place);
     } else {
       output_data = y_data;
@@ -674,10 +674,11 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
   auto x_data = x.data<Tx>();
   auto y_data = y->mutable_data<Ty>(x.place());

-  framework::Tensor tmp;
+  // after config.run()
   // SetOutputData for ReduceHigherDim when should_reduce_again is true,
   // temp_output should be stored temp_data in output_data space or stored in
   // y_data;
+  framework::Tensor tmp;
   config.SetOutputData(y_data, x.place(), &tmp);

   if (config.reduce_num == 1) {
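// ---------------------------------------------------------------------------
// [Editor's note] Where this series lands: every reduce op is expressed as one
// small functor struct bundling an identity element (initial()), a binary
// combine (operator()), and a post-processing Transformer, all consumed by a
// single ReduceCudaKernel. The self-contained sketch below mirrors that
// pattern on the host so it compiles with any C++ compiler. Reduce() and
// main() are illustrative stand-ins for TensorReduceFunctorImpl and the CUDA
// launch path -- they are not Paddle code -- and HOSTDEVICE is stubbed out
// for the same reason.
#include <cassert>
#include <iostream>
#include <limits>
#include <vector>

#define HOSTDEVICE  // stand-in for __host__ __device__ when compiling off-GPU

// Post-processing functor: the identity, as used by max/min/prod/any/all.
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
  HOSTDEVICE explicit IdentityFunctor(int /*reduce_num*/) {}
  HOSTDEVICE Ty operator()(const Tx& x) const { return static_cast<Ty>(x); }
};

// One reducer per op: identity element + binary combine + Transformer alias.
template <typename Tx, typename Ty = Tx>
struct CustomMax {
  using Transformer = IdentityFunctor<Tx, Ty>;
  Ty initial() const { return std::numeric_limits<Ty>::lowest(); }
  HOSTDEVICE Ty operator()(const Ty& a, const Ty& b) const {
    return b > a ? b : a;
  }
};

// Hypothetical sequential stand-in for TensorReduceFunctorImpl: normalize the
// axes the way detail::GetReduceDim does (negative axes wrap around,
// out-of-range axes are rejected), then fold the data through the reducer
// starting from its identity element.
template <typename Tx, typename Ty,
          template <typename, typename> class ReduceOp>
Ty Reduce(const std::vector<Tx>& x, std::vector<int> dims, int rank) {
  for (int& d : dims) {
    assert(d < rank && "reduce axis must be smaller than the tensor rank");
    if (d < 0) d += rank;  // e.g. dim = -1 means the last axis
  }
  ReduceOp<Tx, Ty> reducer;
  typename ReduceOp<Tx, Ty>::Transformer transform(static_cast<int>(x.size()));
  Ty acc = reducer.initial();
  for (const Tx& v : x) acc = reducer(acc, transform(v));
  return acc;
}

int main() {
  std::vector<float> data{3.f, -1.f, 7.f, 2.f};
  // Reduce a rank-1 tensor over axis -1 (axis 0 after normalization).
  std::cout << Reduce<float, float, CustomMax>(data, {-1}, 1) << "\n";  // 7
  return 0;
}
// ---------------------------------------------------------------------------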