Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sum kernel for CPU supporting BF16 and SelectedRows #32631

Merged
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
693a0ad
- registerred sum bf16 kernel of CPU
jczaja Mar 23, 2021
c845581
- first compilable version
jczaja Mar 23, 2021
c8f270d
- implemented BLAS API AXPY via oneDNN
jczaja Mar 25, 2021
cac6ab0
Python UT for sum with selected rows.
arogowie-intel Apr 27, 2021
9bb92f4
Restore random number data initialization for SGD BF16 op.
arogowie-intel Apr 27, 2021
c5abc15
Fix UT
arogowie-intel Apr 27, 2021
12c2620
Simple version for AXPY kernel on CPU supporting bf16.
arogowie-intel Apr 27, 2021
25c48f3
Better use of SFINAE and removal of unnecessary parameters.
arogowie-intel Apr 27, 2021
1b6c3f9
Templetize oneDNN AXPY handler and skip scaling if possible.
arogowie-intel Apr 27, 2021
3d9867e
Small refinements.
arogowie-intel Apr 28, 2021
c66f094
Merge remote-tracking branch 'upstream/develop' into aosewski/sum_sel…
arogowie-intel Apr 28, 2021
85a59aa
Skip test if bfloat16 not supported.
arogowie-intel Apr 28, 2021
fe7ae3b
Remove __restrict__ keyword as it is compiler specific.
arogowie-intel Apr 28, 2021
e435779
Review comments & attempt to fix CUDA compilation on CI.
arogowie-intel Apr 29, 2021
0ac50ac
Try avoid intermediate cast to float for bfloat16 data type.
arogowie-intel Apr 29, 2021
eb0a070
An attempt to satisfy crying coverage CI.
arogowie-intel Apr 30, 2021
947b686
Merge remote-tracking branch 'upstream/develop' into aosewski/sum_sel…
arogowie-intel Apr 30, 2021
e77c8ac
Remove not used if branch.
arogowie-intel Apr 30, 2021
32879a8
Remove unused oneDNN AXPY handler.
arogowie-intel May 5, 2021
616ee30
Merge remote-tracking branch 'upstream/develop' into aosewski/sum_sel…
arogowie-intel May 5, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions paddle/fluid/operators/math/blas_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#ifdef PADDLE_WITH_MKLML
#include <mkl.h>
#endif

#include <algorithm>
#include <cmath>
#include <limits>
Expand All @@ -28,6 +29,27 @@
namespace paddle {
namespace operators {
namespace math {
namespace detail {

/// Portable AXPY fallback: for each of the n elements performs
/// `*y += alpha * *x`, stepping x by incx and y by incy after every element.
///
/// @param n     Number of elements to update; n <= 0 leaves y untouched.
/// @param alpha Scale factor applied to every element read from x.
/// @param x     Pointer to the first input element (read only).
/// @param incx  Pointer step, in elements, between consecutive reads of x.
/// @param y     Pointer to the first element to accumulate into (in place).
/// @param incy  Pointer step, in elements, between consecutive writes to y.
template <typename T>
static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
                 const int incy) {
  // Y = Y + alpha * X
  //
  // The unit-scale test is done in T itself. Narrowing alpha to float first
  // would make a double alpha such as 1 + 1e-10 compare equal to 1 and
  // silently drop the scaling.
  if (alpha == static_cast<T>(1.f)) {
    // alpha == 1: plain accumulation, no multiplication needed.
    while (n-- > 0) {
      *y += *x;
      y = y + incy;
      x = x + incx;
    }
  } else {
    while (n-- > 0) {
      *y += alpha * *x;
      y = y + incy;
      x = x + incx;
    }
  }
}
} // namespace detail

template <typename T>
struct CBlas;
Expand All @@ -43,6 +65,11 @@ struct CBlas<int8_t> {

template <>
struct CBlas<platform::bfloat16> {
// bfloat16 AXPY entry point: every argument is handed, unchanged and in
// order, to the generic detail::axpy loop.
template <typename... ARGS>
static void AXPY(ARGS... forwarded) {
  detail::axpy(forwarded...);
}

template <typename... ARGS>
static void VCOPY(ARGS... args) {
PADDLE_THROW(platform::errors::Unimplemented(
Expand Down
40 changes: 20 additions & 20 deletions paddle/fluid/operators/math/selected_rows_functor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
platform::bfloat16>;

// This is a separate namespace for manipulating SelectedRows-typed
// data, e.g. merging duplicated rows, adding two SelectedRows, etc.
Expand All @@ -294,21 +296,17 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
// add or mul.
namespace scatter {

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_floating_point<T>::value &&
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
size_t data_len, const T* in, T* out) {
blas->AXPY(data_len, 1., in, out);
template <typename T>
typename std::enable_if<std::is_floating_point<T>::value>::type
elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
const T* in, T* out) {
blas->AXPY(data_len, T(1.), in, out);
arogowie-intel marked this conversation as resolved.
Show resolved Hide resolved
}

template <typename DeviceContext, typename T>
typename std::enable_if<
!std::is_floating_point<T>::value &&
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
size_t data_len, const T* in, T* out) {
template <typename T>
typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
T* out) {
for (size_t i = 0; i < data_len; i++) {
out[i] += in[i];
}
Expand Down Expand Up @@ -412,7 +410,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
out.set_rows(merge_rows);

math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), 0.0);
constant_functor(context, out.mutable_value(), static_cast<T>(0.0));
arogowie-intel marked this conversation as resolved.
Show resolved Hide resolved

std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
Expand All @@ -429,9 +427,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> {

for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
elementwise_add_to<platform::CPUDeviceContext, T>(
context, &blas, static_cast<size_t>(input_width),
&input_data[i * input_width], &out_data[out_i * input_width]);
elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
&input_data[i * input_width],
&out_data[out_i * input_width]);
}
}
}
Expand Down Expand Up @@ -524,9 +522,9 @@ struct MergeAverage<platform::CPUDeviceContext, T> {

for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
elementwise_add_to<platform::CPUDeviceContext, T>(
context, &blas, static_cast<size_t>(input_width),
&input_data[i * input_width], &out_data[out_i * input_width]);
elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
&input_data[i * input_width],
&out_data[out_i * input_width]);
}
}
size_t input_width_cast = static_cast<size_t>(input_width);
Expand All @@ -547,6 +545,8 @@ template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex64>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex128>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::bfloat16>;

template struct MergeAverage<platform::CPUDeviceContext, int>;
template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/operators/sum_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL(
sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
ops::SumKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>,
ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
68 changes: 68 additions & 0 deletions paddle/fluid/platform/mkldnn_reuse.h
Original file line number Diff line number Diff line change
Expand Up @@ -961,6 +961,74 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
std::vector<int> logical_axis_;
};

template <typename T>
arogowie-intel marked this conversation as resolved.
Show resolved Hide resolved
class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
public:
AXPYMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
arogowie-intel marked this conversation as resolved.
Show resolved Hide resolved
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, int n, float alpha)
: platform::MKLDNNHandlerT<T, dnnl::reorder>(
dev_ctx, mkldnn_engine, cpu_place,
platform::CreateKey(dev_ctx, static_cast<int64_t>(n),
arogowie-intel marked this conversation as resolved.
Show resolved Hide resolved
platform::MKLDNNGetDataType<T>(), alpha,
"-axpy")),
alpha_(alpha),
n_(n) {}

std::shared_ptr<mkldnn::memory> AcquireMemory(void* ptr,
const std::string& suffix) {
/*Generate key*/
auto local_key = this->key_ + suffix;
auto mem_p = std::static_pointer_cast<mkldnn::memory>(
this->dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
auto md = mkldnn::memory::desc({n_}, platform::MKLDNNGetDataType<T>(),
dnnl::memory::format_tag::x);
mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
this->dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
}
return mem_p;
}

std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
return this->AcquireMemory(platform::to_void_cast(x), "@user_src_mem_p");
}

std::shared_ptr<mkldnn::memory> AcquireDstMemory(T* y) {
return this->AcquireMemory(y, "@user_dst_mem_p");
}

std::shared_ptr<mkldnn::reorder> AcquireReorder(
std::shared_ptr<mkldnn::memory> dst_memory_p,
std::shared_ptr<mkldnn::memory> src_memory_p) {
auto prim_key = this->key_ + "@reorder_p";
auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
this->dev_ctx_.GetBlob(prim_key));
if (reorder_p == nullptr) {
// Here we pass Postops to mimick y -> a*X + y
mkldnn::primitive_attr reorder_attr;
mkldnn::post_ops post_operations;
if (this->alpha_ != 1.f) {
std::vector<float> scales(1, this->alpha_);
reorder_attr.set_output_scales(0, scales);
}
post_operations.append_sum(1.0f);

reorder_attr.set_post_ops(post_operations);
reorder_p = std::make_shared<mkldnn::reorder>(
*(src_memory_p), *(dst_memory_p), reorder_attr);
this->dev_ctx_.SetBlob(prim_key, reorder_p);
}
return reorder_p;
}

private:
float alpha_;
int n_;
};

class ReorderMKLDNNHandler : public MKLDNNHandler {
public:
ReorderMKLDNNHandler(std::vector<int64_t>& dims, // NOLINT
Expand Down
9 changes: 3 additions & 6 deletions python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ def create_sparse_grad_var(self, scope, place, height, rows, row_numel):
grad_selected_rows = scope.var('Grad').get_selected_rows()
grad_selected_rows.set_height(height)
grad_selected_rows.set_rows(rows)
# grad_array = np.random.random((len(rows), row_numel)).astype('float32')
grad_array = np.full((len(rows), row_numel), 2, np.float32)
grad_array = np.random.random((len(rows), row_numel)).astype('float32')
np_array_bf16 = convert_float_to_uint16(grad_array)

grad_tensor = grad_selected_rows.get_tensor()
Expand All @@ -87,8 +86,7 @@ def create_sparse_grad_var(self, scope, place, height, rows, row_numel):

def create_dense_param_var(self, scope, place, height, width):
param_tensor = scope.var('Param').get_tensor()
# param_array = np.random.random((height, width)).astype('float32')
param_array = np.full((height, width), 5, np.float32)
param_array = np.random.random((height, width)).astype('float32')
param_array_bf16 = convert_float_to_uint16(param_array)
param_tensor.set(param_array_bf16, place)

Expand All @@ -109,8 +107,7 @@ def create_sparse_param_var(self, scope, place, height, rows, row_numel):

def create_dense_lr_var(self, scope, place):
lr_tensor = scope.var('LearningRate').get_tensor()
# lr_value = np.random.uniform()
lr_value = 2
lr_value = np.random.uniform()
lr_array = np.full((1), lr_value, np.float32)
lr_array_bf16 = convert_float_to_uint16(lr_array)
lr_tensor.set(lr_array_bf16, place)
Expand Down
62 changes: 62 additions & 0 deletions python/paddle/fluid/tests/unittests/test_sum_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.tests.unittests.op_test import (
OpTest, convert_float_to_uint16, convert_uint16_to_float)


class TestSumOp(OpTest):
Expand Down Expand Up @@ -141,6 +143,64 @@ def test_w_is_selected_rows(self):
self.check_with_place(place, inplace)


@unittest.skipIf(not core.supports_bfloat16(),
                 'place does not support BF16 evaluation')
class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
    """Runs the `sum` operator on three SelectedRows inputs stored as bfloat16.

    bfloat16 values are carried as uint16 arrays; results are converted back
    to float32 before being compared against the reference.
    """

    def setUp(self):
        self.height = 10
        self.row_numel = 12
        self.rows = [0, 1, 2, 3, 4, 5, 6]
        self.dtype = np.uint16
        self.init_kernel_type()
        # Fixed seed so every input that carries data holds the same values.
        np.random.seed(12345)
        self.data = np.random.random((len(self.rows),
                                      self.row_numel)).astype(np.float32)

    def _get_array(self, rows, row_numel):
        """Return the shared data as a uint16 (bfloat16) array.

        When `rows` is empty, an array with zero rows and `row_numel` columns
        is returned instead.
        """
        if len(rows) > 0:
            return convert_float_to_uint16(self.data)
        else:
            return np.ndarray((0, row_numel), dtype=self.dtype)

    def check_input_and_optput(self,
                               scope,
                               place,
                               inplace,
                               w1_has_data=False,
                               w2_has_data=False,
                               w3_has_data=False):
        """Sum W1, W2 and W3 and compare the result with the reference.

        The reference is the shared data multiplied by the number of inputs
        that carry data. With `inplace` set, the result is written to "W1";
        otherwise it is written to "Out".
        """
        self.create_selected_rows(scope, place, "W1", w1_has_data)
        self.create_selected_rows(scope, place, "W2", w2_has_data)
        self.create_selected_rows(scope, place, "W3", w3_has_data)

        # create Out Variable
        if inplace:
            out_var_name = "W1"
        else:
            out_var_name = "Out"
        out = scope.var(out_var_name).get_selected_rows()

        # create and run sum operator
        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
        sum_op.run(scope, place)

        # Number of inputs that actually carry data.
        has_data_w_num = sum(
            1 for has_data in (w1_has_data, w2_has_data, w3_has_data)
            if has_data)

        if has_data_w_num > 0:
            # 7 == len(self.rows) as set in setUp.
            self.assertEqual(len(out.rows()), 7)
            out_bf16 = np.array(out.get_tensor())
            out_fp32 = convert_uint16_to_float(out_bf16)
            ref_fp32 = convert_uint16_to_float(
                self._get_array(self.rows, self.row_numel)) * has_data_w_num
            # Relative tolerance only, loosened for bfloat16 precision.
            np.testing.assert_allclose(out_fp32, ref_fp32, atol=0, rtol=0.95e-2)
        else:
            self.assertEqual(len(out.rows()), 0)


class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
def setUp(self):
self.height = 10
Expand Down Expand Up @@ -324,4 +384,6 @@ def test_list_of_none_input():
create_test_sum_fp16_class(TestLoDTensorAndSelectedRowsOp)

if __name__ == "__main__":
    # Switch Paddle to static-graph mode before the tests are run.
    from paddle import enable_static

    enable_static()
    unittest.main()