Skip to content

Commit

Permalink
update TensorReduceFunc
Browse files — browse the repository at this point in the history
  • Loading branch information
AnnaTrainingG committed Jun 2, 2021
1 parent 790173a commit 8700894
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 46 deletions.
2 changes: 1 addition & 1 deletion paddle/fluid/operators/reduce_ops/reduce_max_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class ReduceMaxKernel : public framework::OpKernel<T> {
auto stream = context.cuda_device_context().stream();
TensorReduceFunc<T, T, CustomMax<T>, detail::IdentityFunctor<T>>(
*input, output, reduce_dims, DataBound<T>::min(), CustomMax<T>(),
detail::IdentityFunctor<T>(), stream);
detail::IdentityFunctor<T>(), detail::IdentityFunctor<T>(), stream);
}
};

Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/reduce_ops/reduce_min_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class ReduceMinKernel : public framework::OpKernel<T> {
auto stream = context.cuda_device_context().stream();
TensorReduceFunc<T, T, CustomMin<T>, detail::IdentityFunctor<T>>(
*input, output, reduce_dims, DataBound<T>::max(), CustomMin<T>(),
detail::IdentityFunctor<T>(), stream);
detail::IdentityFunctor<T>(), detail::IdentityFunctor<T>(), stream);
}
};

Expand Down
93 changes: 62 additions & 31 deletions paddle/fluid/operators/reduce_ops/reduce_op.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ namespace operators {
namespace detail {

// Post processing function for sum, max, min, prod, any
template <typename T>
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
HOSTDEVICE explicit inline IdentityFunctor() {}

HOSTDEVICE inline T operator()(const T& x) const { return x; }
HOSTDEVICE inline Ty operator()(const Tx& x) const {
return static_cast<Ty>(x);
}
};

// Post processing function for mean
Expand Down Expand Up @@ -81,7 +83,7 @@ static inline std::vector<int> GetDimStrides(const std::vector<int>& dims,
#ifdef __HIPCC__
constexpr int kMaxBlock = 256;
#else
constexpr int kMaxBlock = 512;
constexpr int kMaxBlock = 128;
#endif

// get blockDim for reduceLastDim and reduceAny
Expand Down Expand Up @@ -544,8 +546,7 @@ __global__ void ReduceKernelFunction(

template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
typename TransformOp, int kRank, int kReduceRank>
static void LaunchKernel(const Tx* x_data, Ty* y_data,
const platform::Place& place, const ReduceOp& reducer,
static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
const TransformOp& transformer, const Ty& init,
gpuStream_t stream, ReduceConfig<Ty> config) {
#define CUB_REDUCE_TYPE_CASE(type) \
Expand Down Expand Up @@ -589,7 +590,6 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data,
template <typename Tx, typename Ty, int BlockDim, typename ReduceOp,
typename TransformOp>
static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
const platform::Place& place,
const ReduceOp& reducer,
const TransformOp& transformer, const Ty& init,
gpuStream_t stream, ReduceConfig<Ty> config) {
Expand All @@ -606,26 +606,9 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
case i: { \
constexpr auto kReduceRank = i; \
LaunchKernel<Tx, Ty, BlockDim, ReduceOp, TransformOp, kRank, kReduceRank>( \
x_data, y_data, place, reducer, transformer, init, stream, config); \
x_data, y_data, reducer, transformer, init, stream, config); \
} break

// launch CUB::Reduce
if (config.reduce_type == static_cast<int>(ReduceType::kReduceAll)) {
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(
x_data, transformer);
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
config.reduce_num, reducer, init, stream);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
place);
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
config.reduce_num, reducer, init, stream);

return;
}

detail::CheckReduceRank(reduce_rank, rank);
switch (rank) {
CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1););
Expand All @@ -649,10 +632,12 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
#undef CUB_RANK_CASE
}

template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp>
template <typename Tx, typename Ty, typename ReduceOp, typename TransformOp,
typename CubTransformOp = TransformOp>
void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y,
std::vector<int> origin_reduce_dims, const Ty& init,
const ReduceOp& reducer, const TransformOp& transformer,
const CubTransformOp& cub_transformer,
gpuStream_t stream) {
auto x_dim = framework::vectorize<int>(x.dims());
auto config = ReduceConfig<Ty>(origin_reduce_dims, x_dim);
Expand All @@ -673,13 +658,28 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y,
y->Resize(out_dims);
return;
}
// launch CUB::Reduce
if (config.reduce_type == static_cast<int>(ReduceType::kReduceAll)) {
cub::TransformInputIterator<Ty, CubTransformOp, const Tx*> trans_x(
x_data, cub_transformer);
size_t temp_storage_bytes = 0;
cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data,
config.reduce_num, reducer, init, stream);
framework::Tensor tmp;
auto* temp_storage = tmp.mutable_data<uint8_t>(
framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
x.place());
cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data,
config.reduce_num, reducer, init, stream);

return;
}

#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
LaunchReduceKernel<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
x_data, y_data, x.place(), reducer, transformer, init, stream, \
config); \
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
LaunchReduceKernel<Tx, Ty, block_dim, ReduceOp, TransformOp>( \
x_data, y_data, reducer, transformer, init, stream, config); \
} break

switch (detail::GetBlockDim(config.reduce_num)) {
Expand All @@ -696,5 +696,36 @@ void TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y,
#undef CUB_BLOCK_DIM_CASE
}

// Type-erasing helper for TensorReduceFunc: captures the reduction arguments
// once, then lets the framework's data-type visitor call apply<Ty>() with the
// concrete output element type Ty chosen at runtime. TransformOp is a
// two-parameter template (input type, output type), e.g.
// detail::IdentityFunctor; TransformOp<Ty, Ty> is used by the kernel path and
// TransformOp<Tx, Ty> by the CUB path (see TensorReduceFunc).
template <typename Tx, typename ReduceOp,
          template <typename, typename> class TransformOp>
struct TensorReduceFunctorImpl {
  const framework::Tensor& x;  // input tensor; owned by the caller
  framework::Tensor* y;        // output tensor; owned by the caller
  std::vector<int> origin_reduce_dims;
  // NOTE: init and reducer are stored by VALUE, not by reference. Call sites
  // pass temporaries (e.g. static_cast<double>(1.0f), cub::Sum()); reference
  // members bound to those would dangle if this object outlived the
  // constructing full-expression.
  const double init;
  const ReduceOp reducer;
  gpuStream_t stream;
  TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
                          std::vector<int> origin_reduce_dims,
                          const double& init, const ReduceOp& reducer,
                          gpuStream_t stream)
      : x(x),
        y(y),
        origin_reduce_dims(origin_reduce_dims),
        init(init),
        reducer(reducer),
        stream(stream) {}

  // Invoked by the data-type visitor with the concrete output type Ty.
  template <typename Ty>
  void apply() const {
    const Ty init_cast = static_cast<Ty>(init);
    TensorReduceFunc<Tx, Ty, ReduceOp, TransformOp<Ty, Ty>,
                     TransformOp<Tx, Ty>>(x, y, origin_reduce_dims, init_cast,
                                          reducer, TransformOp<Ty, Ty>(),
                                          TransformOp<Tx, Ty>(), stream);
  }
};

} // namespace operators
} // namespace paddle
19 changes: 6 additions & 13 deletions paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,22 +51,15 @@ class ReduceProdKernel : public framework::OpKernel<T> {

auto stream = context.cuda_device_context().stream();
if (out_dtype >= 0) {
#define VisitDataTypeSmall_t(cpp_type, proto_type) \
do { \
if (static_cast<framework::proto::VarType::Type>(out_dtype) == \
proto_type) { \
TensorReduceFunc<T, cpp_type, CustomMul<cpp_type>, \
detail::IdentityFunctor<cpp_type>>( \
*input, output, reduce_dims, static_cast<cpp_type>(1.0f), \
CustomMul<cpp_type>(), detail::IdentityFunctor<cpp_type>(), stream); \
} \
} while (0)
_ForEachDataTypeSmall_(VisitDataTypeSmall_t);
#undef VisitDataTypeSmall_t
framework::VisitDataTypeSmall(
static_cast<framework::proto::VarType::Type>(out_dtype),
TensorReduceFunctorImpl<T, cub::Sum, detail::IdentityFunctor>(
*input, output, reduce_dims, static_cast<double>(1.0f),
cub::Sum(), stream));
} else {
TensorReduceFunc<T, T, CustomMul<T>, detail::IdentityFunctor<T>>(
*input, output, reduce_dims, static_cast<T>(1.0f), CustomMul<T>(),
detail::IdentityFunctor<T>(), stream);
detail::IdentityFunctor<T>(), detail::IdentityFunctor<T>(), stream);
}
}
};
Expand Down

0 comments on commit 8700894

Please sign in to comment.