Modify the reduce op according to the kernel primitive api #35282

Merged
merged 30 commits into from
Sep 8, 2021
Changes from 21 commits
Commits
30 commits
7d58b91
Merge pull request #1 from PaddlePaddle/develop
AnnaTrainingG Mar 25, 2021
1021e08
Merge pull request #2 from PaddlePaddle/develop
AnnaTrainingG Mar 29, 2021
43f53fe
Merge pull request #3 from PaddlePaddle/develop
AnnaTrainingG Apr 19, 2021
d25ab26
Merge pull request #4 from PaddlePaddle/develop
AnnaTrainingG May 7, 2021
8c8717f
Merge pull request #5 from PaddlePaddle/develop
AnnaTrainingG May 25, 2021
9ddf5e8
Merge pull request #6 from PaddlePaddle/develop
AnnaTrainingG May 26, 2021
b0cbcca
Merge pull request #9 from PaddlePaddle/develop
AnnaTrainingG Jun 1, 2021
cdecaf0
Merge pull request #14 from PaddlePaddle/develop
AnnaTrainingG Jun 11, 2021
0da14c9
Merge pull request #16 from PaddlePaddle/develop
AnnaTrainingG Jun 15, 2021
ca95763
Merge pull request #17 from PaddlePaddle/develop
AnnaTrainingG Jun 22, 2021
25ba21c
Merge pull request #18 from PaddlePaddle/develop
AnnaTrainingG Jul 5, 2021
3ce9983
Merge pull request #19 from PaddlePaddle/develop
AnnaTrainingG Jul 6, 2021
61842ed
Merge pull request #20 from PaddlePaddle/develop
AnnaTrainingG Jul 12, 2021
0e2c73b
Merge pull request #21 from PaddlePaddle/develop
AnnaTrainingG Jul 28, 2021
c1e59cf
Merge pull request #22 from PaddlePaddle/develop
AnnaTrainingG Aug 2, 2021
3a54149
Merge pull request #23 from PaddlePaddle/develop
AnnaTrainingG Aug 4, 2021
7addd79
Merge pull request #24 from PaddlePaddle/develop
AnnaTrainingG Aug 11, 2021
1e843d1
Merge pull request #25 from PaddlePaddle/develop
AnnaTrainingG Aug 23, 2021
2783c76
commit for pool higher preformance
AnnaTrainingG Aug 31, 2021
73d13a3
update ReduceMode
AnnaTrainingG Aug 31, 2021
b0e3fdb
update ReduceMode
AnnaTrainingG Aug 31, 2021
e1a92d6
Merge pull request #26 from PaddlePaddle/develop
AnnaTrainingG Sep 1, 2021
9349c38
Merge branch 'develop' of https://github.com/niuliling123/Paddle into…
AnnaTrainingG Sep 2, 2021
2c32248
Add API comments and specify variable names
AnnaTrainingG Sep 2, 2021
05da032
Merge pull request #27 from PaddlePaddle/develop
AnnaTrainingG Sep 3, 2021
840a652
update
AnnaTrainingG Sep 4, 2021
fc36b45
update
AnnaTrainingG Sep 4, 2021
e1fe6dc
Merge pull request #28 from PaddlePaddle/develop
AnnaTrainingG Sep 6, 2021
d9b9f42
Merge branch 'develop' of https://github.com/niuliling123/Paddle into…
AnnaTrainingG Sep 6, 2021
51f2f77
update detail to details
AnnaTrainingG Sep 6, 2021
222 changes: 178 additions & 44 deletions paddle/fluid/operators/kernel_primitives/compute_primitives.h
@@ -21,14 +21,25 @@
#include <hip/hip_fp16.h>
#endif

#include <algorithm>
// #include <algorithm>
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace details {

#ifdef __HIPCC__
constexpr int kMaxThread = 256;
constexpr int kWarpSize = 64;
#else
constexpr int kMaxThread = 128;
constexpr int kWarpSize = 32;
#endif

enum ReduceMode { kGlobalMode, kLocalMode };

template <typename T>
class MPTypeTrait {
public:
@@ -41,37 +52,98 @@ class MPTypeTrait<platform::float16> {
using Type = float;
};

} // namespace details
/**
* @brief Used in BlockYReduce: returns this thread's shared-memory index at
* the given offset along threadIdx.y
*/
__device__ __forceinline__ int SharedMemoryIndex(int index) {
return (threadIdx.y + index) * blockDim.x + threadIdx.x;
}

/*************************** Compute Functor****************************/
template <typename T, typename Enable = void>
struct DivFunctor {
inline HOSTDEVICE T operator()(const T* args) const {
return args[0] / args[1];
template <typename T, typename ReduceOp>
__device__ __forceinline__ T WarpReduce(T val, ReduceOp reducer) {
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int stride = details::kWarpSize / 2; stride > 0; stride >>= 1) {
T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride);
val = reducer(val, temp);
}
};
return val;
}

template <typename T>
struct DivFunctor<T, typename std::enable_if_t<std::is_integral<T>::value>> {
inline HOSTDEVICE T operator()(const T* args) const {
PADDLE_ENFORCE(args[1] != 0,
platform::errors::InvalidArgument(
"Invalid Argument Error: Integer division by zero "
"encountered in divide. Please check the input value."));
return args[0] / args[1];
/* e.g.
* |---------block---------|
* |warp0|warp1|warp2|warp3|
* |0~31|32~63|64~95|96~127| ---->blockDim.x = 128
* \|/ \|/ \|/ \|/ ---->1. First WarpReduce in each warp
* res0 res1 res2 res3 ---->2. Store result of each warp to shared memory
* \ \ / / ---->3. Load the result above from shared memory
* res to warp0 and process the second WarpReduce
*/

/**
* @brief BlockXReduce reduce along blockDim.x
*/
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
__syncthreads();
using details::kWarpSize;
__shared__ T shared[2 * kWarpSize];
int block_dim_x = blockDim.x;
if (blockDim.x > kWarpSize) {
block_dim_x = blockDim.x / kWarpSize;
int lane = threadIdx.x % kWarpSize;
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int wid = tid / kWarpSize;
int bid = threadIdx.y;
val = WarpReduce(val, reducer);
if (lane == 0) {
shared[wid] = val;
}
__syncthreads();
val = shared[bid * block_dim_x + lane];
}
};

unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int stride = 1; stride < block_dim_x; stride <<= 1) {
T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride);
val = reducer(val, temp);
}
return val;
}
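
// ---------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of this diff). Assumption:
// `reducer` may be any callable taking two values of type T; MaxFunctor and
// RowMaxKernel below are hypothetical names used only for illustration.
struct MaxFunctor {
  __device__ __forceinline__ float operator()(const float& a,
                                              const float& b) const {
    return a > b ? a : b;
  }
};

__global__ void RowMaxKernel(const float* in, float* out, int row_width) {
  // One block per row; blockDim.x threads stride over the row's columns.
  float thread_max = -1e30f;
  for (int col = threadIdx.x; col < row_width; col += blockDim.x) {
    thread_max = MaxFunctor()(thread_max, in[blockIdx.x * row_width + col]);
  }
  // Fold the per-thread partials along blockDim.x (warp shuffles, plus shared
  // memory when blockDim.x > kWarpSize); thread 0 then holds the row maximum.
  float row_max = BlockXReduce(thread_max, MaxFunctor());
  if (threadIdx.x == 0) out[blockIdx.x] = row_max;
}
// ---------------------------------------------------------------------------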

/**
* @brief BlockYReduce reduce along blockDim.y
*/
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
__shared__ T shared_memory[details::kMaxThread];
shared_memory[SharedMemoryIndex(0)] = val;
for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) {
__syncthreads();
if (threadIdx.y < stride && threadIdx.y + stride < blockDim.y) {
T temp = shared_memory[SharedMemoryIndex(stride)];
val = reducer(val, temp);
}
shared_memory[SharedMemoryIndex(0)] = val;
}
return val;
}
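
// ---------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of this diff), assuming a
// 2-D thread block such as blockDim = (32, 4) (so blockDim.x * blockDim.y
// stays within kMaxThread). Each column of threads (fixed threadIdx.x) folds
// its partials along threadIdx.y; afterwards the threads with threadIdx.y == 0
// hold one result per column. SumFunctor and ColumnSumKernel are hypothetical
// names used only for illustration.
struct SumFunctor {
  __device__ __forceinline__ float operator()(const float& a,
                                              const float& b) const {
    return a + b;
  }
};

__global__ void ColumnSumKernel(const float* in, float* out, int num_cols) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  // Each thread owns one element of its column; rows are indexed by threadIdx.y.
  float partial = (col < num_cols) ? in[threadIdx.y * num_cols + col] : 0.0f;
  float col_sum = BlockYReduce(partial, SumFunctor());
  if (threadIdx.y == 0 && col < num_cols) out[col] = col_sum;
}
// ---------------------------------------------------------------------------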

} // namespace details

/*************************** Compute Function****************************/

/**
* @brief compute functor for elementwise_two, in1 and in2 has the same shape
* @brief binary function, in1 and in2 have the same shape
* @param:
* T : the type of in1 and in2
* NX: the row of in1 and in2
* NY: the col of in1 and in2
* BlockSize: the strid of col
* OpFunc: compute functor eg: ADD, SUB, XOR, OR, MUL
* T: data type of in1, in2
* OutT: data type of out
* NX: the cols of in1, in2
* NY: the rows of in1, in2
* BlockSize: the config of this device
* OpFunc: compute functor eg: in1 + in2, in1 - in2
*/
template <typename T, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
@@ -88,32 +160,40 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1,
}
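
// ---------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of this diff). Assumption:
// the binary OpFunc follows the same pointer-of-arguments convention as the
// removed DivFunctor above and the ElementwiseTernary body below (the binary
// body is collapsed in this hunk). AddFunctor is a hypothetical name.
template <typename T>
struct AddFunctor {
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] + args[1];  // out[idx] = in1[idx] + in2[idx]
  }
};
// Possible use on per-thread register buffers of NX * NY elements:
//   ElementwiseBinary<T, T, NX, NY, 1, AddFunctor<T>>(out_reg, in1_reg,
//                                                     in2_reg, AddFunctor<T>());
// ---------------------------------------------------------------------------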

/**
* @brief fma eg: a * b + c, in1 in2, in3 and out has the same shape
* @brief ternary function, in1, in2 and in3 have the same shape
* @param:
* T : the type of in1 and in2, in3
* NX: the row of in1, in2 and in3
* NY: the col of in1, in2 and in3
* BlockSize: the strid of col
* T: data type of in1, in2, in3
* OutT: data type of out
* NX: the cols of in1, in2
* NY: the rows of in1, in2
* BlockSize: the config of this device
* OpFunc: compute functor eg: out = in1 * in2 + in3
*/
template <typename T, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseFma(OutT* out, const T* in1,
const T* in2, const T* in3,
OpFunc compute) {
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1,
const T* in2, const T* in3,
OpFunc compute) {
T args[3];
#pragma unroll
for (int idx = 0; idx < NX * NY; ++idx) {
out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
args[0] = in1[idx];
args[1] = in2[idx];
args[2] = in3[idx];
out[idx] = static_cast<OutT>(compute(args));
}
}
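
// ---------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of this diff): an OpFunc
// matching the compute(args) call above, e.g. the fused multiply-add mentioned
// in the comment. FmaFunctor is a hypothetical name.
template <typename T>
struct FmaFunctor {
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] * args[1] + args[2];  // out = in1 * in2 + in3
  }
};
// e.g. ElementwiseTernary<T, T, NX, NY, 1, FmaFunctor<T>>(
//          out_reg, in1_reg, in2_reg, in3_reg, FmaFunctor<T>());
// ---------------------------------------------------------------------------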

/**
* @brief compute functor for elementwise_two, in1 is [1, NY], in2 is [NX, NY]
* @brief cycle binary function; in1's shape is [1, NX], in2's shape is
* [NY, NX], and out's shape is [NY, NX]
* @param:
* T : the type of in1 and in2
* NX: the row of in1 and in2
* NY: the col of in2
* BlockSize: the strid of col
* OpFunc: compute functor eg: ADD, SUB, XOR, OR, MUL
* T: data type of in1, in2
* OutT: data type of out
* NX: the cols of in1, in2
* NY: the rows of in1, in2
* BlockSize: the config of this device
* OpFunc: compute functor eg: in1 + in2, in1 - in2
*/
template <typename T, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
@@ -130,13 +210,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1,
}
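
// ---------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of this diff): the broadcast
// described above, written out for NX = 4, NY = 2. in1 holds a single row that
// is reused against every row of in2; out[i][j] = f(in1[j], in2[i][j]). (The
// loop body is collapsed in this hunk, so the exact functor call is not shown.)
//   in1 = [a0 a1 a2 a3]                                     shape [1, NX]
//   in2 = [b00 b01 b02 b03,  b10 b11 b12 b13]               shape [NY, NX]
//   out = [f(a0,b00) .. f(a3,b03),  f(a0,b10) .. f(a3,b13)] shape [NY, NX]
// ---------------------------------------------------------------------------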

/**
* @brief compute functor for unary, in1 is [NX, NY]
* @brief unary function
* @param:
* T : the type of in
* NX: the row of in
* NY: the col of in
* BlockSize: the strid of col
* OpFunc: compute functor eg: relu, sigmoid, exp
* T: data type of in
* OutT: data type of out
* NX: the cols of in
* NY: the rows of in
* BlockSize: the config of this device
* OpFunc: compute functor eg: relu, exp
*/
template <typename T, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
@@ -148,6 +229,59 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in,
}
}
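
// ---------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of this diff). Assumption:
// the unary OpFunc takes a pointer to its single argument, mirroring the
// pointer-of-arguments convention of the other functors in this file (the
// unary body is collapsed in this hunk). ReluFunctor is a hypothetical name.
template <typename T>
struct ReluFunctor {
  inline HOSTDEVICE T operator()(const T* args) const {
    return args[0] > static_cast<T>(0) ? args[0] : static_cast<T>(0);
  }
};
// e.g. ElementwiseUnary<T, T, NX, NY, 1, ReluFunctor<T>>(out_reg, in_reg,
//                                                        ReluFunctor<T>());
// ---------------------------------------------------------------------------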

/**
* @brief reduce function; in's shape is [NX, NY].
* If ReduceMode == kLocalMode, NX is reduced and the shape of out is [NY, 1];
* if ReduceMode == kGlobalMode, the reduction runs across different threads
* and the shape of out is [NY, NX]. If reduce_last_dim is false and
* reduce_num has been split, BlockYReduce will be called; if reduce_last_dim
* is true and reduce_num has been split, BlockXReduce will be called.
* @typename:
* T: data type of in
* NX: the cols of in
* NY: the rows of in
* BlockSize: the config of this device
* OpFunc: reduce functor, eg: CustomSum, CustomMean in reduce_functor_op.h
* @param:
* reducer: reduce functor, eg: CustomSum<T>()
* reduce_last_dim: true if in's last dim needs to be reduced
*/
template <typename T, int NX, int NY, int BlockSize, class OpFunc,
details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
bool reduce_last_dim) {
int block_index = blockDim.y;

if (Mode == details::ReduceMode::kGlobalMode) {
bool block_reduce_y = (!reduce_last_dim) && (block_index > 1);
// when reduce is not required for the last dim, and reduce num has been
// split into multiple threads
if (block_reduce_y) {
#pragma unroll
for (int i = 0; i < NY * NX; i++) { // reduce along blockdim.y
out[i] = details::BlockYReduce<T, OpFunc>(out[i], reducer);
}
}

// when last dimension need to be reduced
if (reduce_last_dim) {
#pragma unroll
for (int i = 0; i < NY * NX; i++) { // reduce along blockDim.x
out[i] = details::BlockXReduce<T, OpFunc>(out[i], reducer);
}
}
} else { // else kLocalMode
#pragma unroll
for (int i = 0; i < NY; ++i) {
#pragma unroll
for (int j = 0; j < NX; ++j) {
out[i] = reducer(out[i], in[i * NX + j]);
}
}
}
}
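
// ---------------------------------------------------------------------------
// Illustrative sketch (editor's addition, not part of this diff). Assumption:
// `reducer` is any callable taking two T values, such as the hypothetical
// SumOp below; out_reg / in_reg are per-thread register buffers and out_reg
// is pre-initialized (kLocalMode accumulates into it).
struct SumOp {
  __device__ __forceinline__ float operator()(const float& a,
                                              const float& b) const {
    return a + b;
  }
};
// 1) Thread-local step: fold the NX columns of in_reg into out_reg[0..NY-1]:
//      Reduce<float, NX, NY, 1, SumOp, details::kLocalMode>(
//          out_reg, in_reg, SumOp(), /*reduce_last_dim=*/true);
// 2) Cross-thread step: combine the partials held by the block's threads
//    (BlockXReduce when the last dim is reduced, BlockYReduce otherwise):
//      Reduce<float, NX, NY, 1, SumOp, details::kGlobalMode>(
//          out_reg, in_reg, SumOp(), /*reduce_last_dim=*/true);
// ---------------------------------------------------------------------------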

} // namespace kernel_primitives
} // namespace operators
} // namespace paddle