PaddlePaddle · luotao1 · May 6, 2021 · Mar 23, 2021 · Mar 23, 2021 · Mar 25, 2021
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
@@ -15,6 +15,7 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl.h>
 #endif
+
 #include <algorithm>
 #include <cmath>
 #include <limits>
@@ -28,6 +29,27 @@
 namespace paddle {
 namespace operators {
 namespace math {
+namespace detail {
+
+template <typename T>
+static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
+                 const int incy) {
+  // Y = Y + alpha * X (strided by incx/incy); alpha == 1 skips the multiply
+  if (static_cast<float>(alpha) == 1.f) {
+    while (n-- > 0) {
+      *y += *x;
+      y = y + incy;
+      x = x + incx;
+    }
+  } else {
+    while (n-- > 0) {
+      *y += alpha * *x;
+      y = y + incy;
+      x = x + incx;
+    }
+  }
+}
+}  // namespace detail
 
 template <typename T>
 struct CBlas;
@@ -43,6 +65,11 @@ struct CBlas<int8_t> {
 
 template <>
 struct CBlas<platform::bfloat16> {
+  template <typename... ARGS>
+  static void AXPY(ARGS... args) {
+    detail::axpy(args...);  // plain strided loop defined above in this header
+  }
+
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
 template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
+                                        platform::bfloat16>;
 
 // This is a separated namespace for manipulate SelectedRows typed
 // data. Like merge duplicated rows, adding two SelectedRows etc.
@@ -294,21 +296,17 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
 // add or mul.
 namespace scatter {
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
-  blas->AXPY(data_len, 1., in, out);
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
+  blas->AXPY(data_len, T(1.), in, out);  // alpha = T(1), i.e. out += in
 }
 
-template <typename DeviceContext, typename T>
-typename std::enable_if<
-    !std::is_floating_point<T>::value &&
-    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
-elementwise_add_to(const DeviceContext& ctx, BlasT<DeviceContext, T>* blas,
-                   size_t data_len, const T* in, T* out) {
+template <typename T>
+typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
+    BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
+    T* out) {
   for (size_t i = 0; i < data_len; i++) {
     out[i] += in[i];
   }
@@ -412,7 +410,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
       out.set_rows(merge_rows);
 
       math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-      constant_functor(context, out.mutable_value(), 0.0);
+      constant_functor(context, out.mutable_value(), static_cast<T>(0.0));
 
       std::unordered_map<int64_t, size_t> rows_to_id;
       for (size_t i = 0; i < merge_rows.size(); ++i) {
@@ -429,9 +427,9 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
 
         for (size_t i = 0; i < input_rows.size(); i++) {
           size_t out_i = rows_to_id[input_rows[i]];
-          elementwise_add_to<platform::CPUDeviceContext, T>(
-              context, &blas, static_cast<size_t>(input_width),
-              &input_data[i * input_width], &out_data[out_i * input_width]);
+          elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
+                                &input_data[i * input_width],
+                                &out_data[out_i * input_width]);
         }
       }
     }
@@ -524,9 +522,9 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
 
       for (size_t i = 0; i < input_rows.size(); i++) {
         size_t out_i = rows_to_id[input_rows[i]];
-        elementwise_add_to<platform::CPUDeviceContext, T>(
-            context, &blas, static_cast<size_t>(input_width),
-            &input_data[i * input_width], &out_data[out_i * input_width]);
+        elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
+                              &input_data[i * input_width],
+                              &out_data[out_i * input_width]);
       }
     }
     size_t input_width_cast = static_cast<size_t>(input_width);
@@ -547,6 +545,8 @@ template struct MergeAdd<platform::CPUDeviceContext,
                          paddle::platform::complex64>;
 template struct MergeAdd<platform::CPUDeviceContext,
                          paddle::platform::complex128>;
+template struct MergeAdd<platform::CPUDeviceContext,
+                         paddle::platform::bfloat16>;
 
 template struct MergeAverage<platform::CPUDeviceContext, int>;
 template struct MergeAverage<platform::CPUDeviceContext, int64_t>;

diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
@@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL(
     sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext,
+                   paddle::platform::bfloat16>,
     ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
@@ -961,6 +961,74 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
   std::vector<int> logical_axis_;
 };
 
+template <typename T>
+class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  AXPYMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+                    const mkldnn::engine mkldnn_engine,
+                    platform::Place cpu_place, int n, float alpha)
+      : platform::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            platform::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                                platform::MKLDNNGetDataType<T>(), alpha,
+                                "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(void* ptr,
+                                                const std::string& suffix) {
+    /* Blob-cache key for this buffer: handler key + role suffix */
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<mkldnn::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {  // not cached yet: wrap ptr as 1-D memory
+      auto md = mkldnn::memory::desc({n_}, platform::MKLDNNGetDataType<T>(),
+                                     dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);  // reuse cached object, rebind buffer
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
+    return this->AcquireMemory(platform::to_void_cast(x), "@user_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(T* y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::reorder> AcquireReorder(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      mkldnn::primitive_attr reorder_attr;
+      mkldnn::post_ops post_operations;
+      if (this->alpha_ != 1.f) {  // a == 1 needs no output scale
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      post_operations.append_sum(1.0f);  // accumulate into existing dst
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<mkldnn::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;  // scale applied to the source (the "a" in a*X + y)
+  int n_;  // element count of the 1-D memory descriptors
+};
+
 class ReorderMKLDNNHandler : public MKLDNNHandler {
  public:
   ReorderMKLDNNHandler(std::vector<int64_t>& dims,  // NOLINT

diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -76,8 +76,7 @@ def create_sparse_grad_var(self, scope, place, height, rows, row_numel):
         grad_selected_rows = scope.var('Grad').get_selected_rows()
         grad_selected_rows.set_height(height)
         grad_selected_rows.set_rows(rows)
-        # grad_array = np.random.random((len(rows), row_numel)).astype('float32')
-        grad_array = np.full((len(rows), row_numel), 2, np.float32)
+        grad_array = np.random.random((len(rows), row_numel)).astype('float32')
         np_array_bf16 = convert_float_to_uint16(grad_array)
 
         grad_tensor = grad_selected_rows.get_tensor()
@@ -87,8 +86,7 @@ def create_sparse_grad_var(self, scope, place, height, rows, row_numel):
 
     def create_dense_param_var(self, scope, place, height, width):
         param_tensor = scope.var('Param').get_tensor()
-        # param_array = np.random.random((height, width)).astype('float32')
-        param_array = np.full((height, width), 5, np.float32)
+        param_array = np.random.random((height, width)).astype('float32')
         param_array_bf16 = convert_float_to_uint16(param_array)
         param_tensor.set(param_array_bf16, place)
 
@@ -109,8 +107,7 @@ def create_sparse_param_var(self, scope, place, height, rows, row_numel):
 
     def create_dense_lr_var(self, scope, place):
         lr_tensor = scope.var('LearningRate').get_tensor()
-        # lr_value = np.random.uniform()
-        lr_value = 2
+        lr_value = np.random.uniform()
         lr_array = np.full((1), lr_value, np.float32)
         lr_array_bf16 = convert_float_to_uint16(lr_array)
         lr_tensor.set(lr_array_bf16, place)

diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -21,6 +21,8 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+from paddle.fluid.tests.unittests.op_test import (
+    OpTest, convert_float_to_uint16, convert_uint16_to_float)
 
 
 class TestSumOp(OpTest):
@@ -141,6 +143,64 @@ def test_w_is_selected_rows(self):
                 self.check_with_place(place, inplace)
 
 
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSelectedRowsSumBF16Op(TestSelectedRowsSumOp):
+    def setUp(self):
+        self.height = 10
+        self.row_numel = 12
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+        self.dtype = np.uint16  # bf16 data is carried as uint16 here
+        self.init_kernel_type()
+        np.random.seed(12345)  # fixed seed: reproducible data
+        self.data = np.random.random((len(self.rows),
+                                      self.row_numel)).astype(np.float32)
+
+    def _get_array(self, rows, row_numel):
+        if len(rows) > 0:
+            return convert_float_to_uint16(self.data)
+        else:
+            return np.ndarray((0, row_numel), dtype=self.dtype)
+
+    def check_input_and_optput(self,
+                               scope,
+                               place,
+                               inplace,
+                               w1_has_data=False,
+                               w2_has_data=False,
+                               w3_has_data=False):
+
+        self.create_selected_rows(scope, place, "W1", w1_has_data)
+        self.create_selected_rows(scope, place, "W2", w2_has_data)
+        self.create_selected_rows(scope, place, "W3", w3_has_data)
+
+        # create Out variable (W1 itself when running in place)
+        if inplace:
+            out_var_name = "W1"
+        else:
+            out_var_name = "Out"
+        out = scope.var(out_var_name).get_selected_rows()
+
+        # create and run the sum operator on W1, W2 and W3
+        sum_op = Operator("sum", X=["W1", "W2", "W3"], Out=out_var_name)
+        sum_op.run(scope, place)
+
+        has_data_w_num = 0
+        for has_data in [w1_has_data, w2_has_data, w3_has_data]:
+            if has_data:
+                has_data_w_num += 1
+
+        if has_data_w_num > 0:
+            self.assertEqual(len(out.rows()), 7)  # 7 == len(self.rows)
+            out_bf16 = np.array(out.get_tensor())
+            out_fp32 = convert_uint16_to_float(out_bf16)
+            ref_fp32 = convert_uint16_to_float(
+                self._get_array(self.rows, self.row_numel)) * has_data_w_num
+            np.testing.assert_allclose(out_fp32, ref_fp32, atol=0, rtol=0.95e-2)
+        else:
+            self.assertEqual(len(out.rows()), 0)
+
+
 class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
     def setUp(self):
         self.height = 10
@@ -324,4 +384,6 @@ def test_list_of_none_input():
 create_test_sum_fp16_class(TestLoDTensorAndSelectedRowsOp)
 
 if __name__ == "__main__":
+    from paddle import enable_static
+    enable_static()
     unittest.main()