From 2296053a3ac1b3a9dee33b6a83a91d18b042f76e Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Fri, 21 May 2021 12:07:35 +0200
Subject: [PATCH 01/13] Add oneDNN AXPY handler.

---
 paddle/fluid/operators/math/blas_impl.h     | 46 ++++++++++-
 paddle/fluid/platform/mkldnn/axpy_handler.h | 92 +++++++++++++++++++++
 2 files changed, 136 insertions(+), 2 deletions(-)
 create mode 100644 paddle/fluid/platform/mkldnn/axpy_handler.h

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 05d42f02c1003..5c2ae63ccf9da 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -21,6 +21,11 @@
 #include
 #include
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn/axpy_handler.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex128.h"
@@ -41,6 +46,40 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
     x = x + incx;
   }
 }
+
+#ifdef PADDLE_WITH_MKLDNN
+
+static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
+                                const platform::bfloat16 *x, int incx,
+                                platform::bfloat16 *y, int incy) {
+  // TODO(jczaja): support increment values other than 1
+  PADDLE_ENFORCE_EQ(incx, 1, platform::errors::Unimplemented(
+                                 "Blas AXPY supports incx == 1 only"));
+  PADDLE_ENFORCE_EQ(incy, 1, platform::errors::Unimplemented(
+                                 "Blas AXPY supports incy == 1 only"));
+
+  auto &pool = platform::DeviceContextPool::Instance();
+  auto cpu_place = platform::CPUPlace();
+  auto *dev_ctx =
+      dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(cpu_place));
+  auto &cpu_engine = dev_ctx->GetEngine();
+
+  platform::AXPYMKLDNNHandler<platform::bfloat16> handler(
+      *dev_ctx, cpu_engine, cpu_place, n, static_cast<float>(alpha));
+
+  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
+  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
+  auto reorder_p =
+      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+  auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
+  platform::RecordEvent record_reorder("axpy_int_reorder",
+                                       platform::EventRole::kUniqueOp);
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+}
+#endif
+
 }  // namespace detail
 
 template <typename T>
 struct CBlas;
 
@@ -57,11 +96,14 @@ template <>
 struct CBlas<platform::bfloat16> {
+#ifdef PADDLE_WITH_MKLDNN
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-    detail::axpy(args...);
+    detail::onednn_handler_axpy(args...);
+#else
+    detail::axpy(args...);
+#endif
   }
-
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/platform/mkldnn/axpy_handler.h b/paddle/fluid/platform/mkldnn/axpy_handler.h
new file mode 100644
index 0000000000000..d970c571c8318
--- /dev/null
+++ b/paddle/fluid/platform/mkldnn/axpy_handler.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace platform {
+
+template <typename T>
+class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  AXPYMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
+                    const mkldnn::engine mkldnn_engine,
+                    platform::Place cpu_place, int n, float alpha)
+      : platform::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            platform::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                                platform::MKLDNNGetDataType<T>(), alpha,
+                                "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(void* ptr,
+                                                const std::string& suffix) {
+    /*Generate key*/
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<mkldnn::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto md = mkldnn::memory::desc({n_}, platform::MKLDNNGetDataType<T>(),
+                                     dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
+    return this->AcquireMemory(platform::to_void_cast<T>(x), "@user_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(T* y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::reorder> AcquireReorder(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      mkldnn::primitive_attr reorder_attr;
+      mkldnn::post_ops post_operations;
+      if (this->alpha_ != 1.f) {
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      post_operations.append_sum(1.0f);
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<mkldnn::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;
+  int n_;
+};
+
+}  // namespace platform
+}  // namespace paddle

From 3b7b23a7f7ffe5b05e5083340d188362263f4384 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Wed, 9 Jun 2021 19:04:39 +0200
Subject: [PATCH 02/13] Add fallback for small tensors.
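
The AXPYMKLDNNHandler introduced in the previous patch realizes y = a*X + y
with a single oneDNN reorder: the output scale multiplies the source by
alpha, and the sum post-op accumulates the scaled source into the existing
destination. The same trick can be pictured outside Paddle; a minimal
standalone sketch, assuming the oneDNN 2.x C++ API (set_output_scales was
removed in oneDNN 3.0) and a hypothetical function name:

    #include <vector>
    #include "dnnl.hpp"

    // y = alpha * x + y for contiguous f32 data, done by one reorder.
    void axpy_via_reorder(int n, float alpha, const float *x, float *y) {
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      dnnl::stream stream(eng);

      auto md = dnnl::memory::desc({n}, dnnl::memory::data_type::f32,
                                   dnnl::memory::format_tag::x);
      dnnl::memory src(md, eng, const_cast<float *>(x));
      dnnl::memory dst(md, eng, y);

      dnnl::primitive_attr attr;
      attr.set_output_scales(0, {alpha});  // scale the source by alpha
      dnnl::post_ops ops;
      ops.append_sum(1.0f);                // then add the existing dst values
      attr.set_post_ops(ops);

      dnnl::reorder(src, dst, attr).execute(stream, src, dst);
      stream.wait();
    }

Creating an engine, stream, and primitive per call has a fixed cost, which
is why very small tensors are better served by a plain loop; hence this
patch's fallback.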
---
 paddle/fluid/operators/math/blas_impl.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 693f17cf3d535..75005881664ec 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -51,6 +51,12 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
 static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
                                 const platform::bfloat16 *x, int incx,
                                 platform::bfloat16 *y, int incy) {
+  // fallback to naive version
+  if (n < 100) {
+    axpy(n, alpha, x, incx, y, incy);
+    return;
+  }
+
   // TODO(jczaja): support increment values other than 1
   PADDLE_ENFORCE_EQ(incx, 1, platform::errors::Unimplemented(
                                  "Blas AXPY supports incx == 1 only"));
   PADDLE_ENFORCE_EQ(incy, 1, platform::errors::Unimplemented(
                                  "Blas AXPY supports incy == 1 only"));

From b2d6dfefd02231e3ff7a8c3f5936cecf0809a885 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Wed, 9 Jun 2021 19:05:17 +0200
Subject: [PATCH 03/13] Fix ifdefs

---
 paddle/fluid/operators/math/blas_impl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 75005881664ec..f713cdf21b898 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -101,12 +101,12 @@
 template <>
 struct CBlas<platform::bfloat16> {
-#ifdef PADDLE_WITH_MKLDNN
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
+#ifdef PADDLE_WITH_MKLDNN
     detail::onednn_handler_axpy(args...);
 #else
-    detail::axpy(args...);
+    detail::axpy(args...);
 #endif
   }
   template <typename... ARGS>

From 425a76b0c91b98388a32b3510dc9a5159d3f64e9 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 10:37:12 +0200
Subject: [PATCH 04/13] Remove unnecessary namespace prefixes and add missing
 headers.

---
 paddle/fluid/platform/mkldnn/axpy_handler.h | 25 ++++++++++++---------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/platform/mkldnn/axpy_handler.h b/paddle/fluid/platform/mkldnn/axpy_handler.h
index d970c571c8318..4aeb712d02191 100644
--- a/paddle/fluid/platform/mkldnn/axpy_handler.h
+++ b/paddle/fluid/platform/mkldnn/axpy_handler.h
@@ -13,24 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include
+#include
+#include
+#include
+
 #include "mkldnn.hpp"
 #include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace platform {
 
 template <typename T>
-class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
+class AXPYMKLDNNHandler : public MKLDNNHandlerT<T, dnnl::reorder> {
  public:
-  AXPYMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
-                    const mkldnn::engine mkldnn_engine,
-                    platform::Place cpu_place, int n, float alpha)
-      : platform::MKLDNNHandlerT<T, dnnl::reorder>(
+  AXPYMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+                    const mkldnn::engine mkldnn_engine, Place cpu_place, int n,
+                    float alpha)
+      : MKLDNNHandlerT<T, dnnl::reorder>(
             dev_ctx, mkldnn_engine, cpu_place,
-            platform::CreateKey(dev_ctx, static_cast<int64_t>(n),
-                                platform::MKLDNNGetDataType<T>(), alpha,
-                                "-axpy")),
+            CreateKey(dev_ctx, static_cast<int64_t>(n), MKLDNNGetDataType<T>(),
+                      alpha, "-axpy")),
         alpha_(alpha),
         n_(n) {}
 
@@ -41,7 +46,7 @@ class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
     auto mem_p = std::static_pointer_cast<mkldnn::memory>(
         this->dev_ctx_.GetBlob(local_key));
     if (mem_p == nullptr) {
-      auto md = mkldnn::memory::desc({n_}, platform::MKLDNNGetDataType<T>(),
+      auto md = mkldnn::memory::desc({n_}, MKLDNNGetDataType<T>(),
                                      dnnl::memory::format_tag::x);
       mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
       this->dev_ctx_.SetBlob(local_key, mem_p);
@@ -52,7 +57,7 @@ class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
   }
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
-    return this->AcquireMemory(platform::to_void_cast<T>(x), "@user_src_mem_p");
+    return this->AcquireMemory(to_void_cast<T>(x), "@user_src_mem_p");
   }

From 62b1ada8c4729c7d60e89218f4dc0b307ee9be25 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 10:39:04 +0200
Subject: [PATCH 05/13] Guard handler_axpy with proper ifdefs.

* Compilation of this function is possible only when Paddle is not built
  with CUDA or HIP.
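
The plain PADDLE_WITH_MKLDNN check is not sufficient here, presumably
because blas_impl.h is also seen by the CUDA/HIP compilation passes. If the
triple condition were needed in more places, it could be folded into one
helper macro; a sketch only, with a hypothetical macro name that is not part
of this patch:

    #if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
        !defined(PADDLE_WITH_HIP)
    #define PADDLE_CPU_ONLY_ONEDNN_AXPY 1  // oneDNN AXPY usable in this build
    #endif

    // At a use site:
    #ifdef PADDLE_CPU_ONLY_ONEDNN_AXPY
        detail::onednn_handler_axpy(args...);
    #else
        detail::axpy(args...);
    #endif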
---
 paddle/fluid/operators/math/blas_impl.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index f713cdf21b898..89f1c2df6c691 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -21,9 +21,11 @@
 #include
 #include
 
-#ifdef PADDLE_WITH_MKLDNN
+#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/mkldnn/axpy_handler.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/profiler.h"
 #endif
 
 #include "paddle/fluid/operators/math/math_function.h"
@@ -46,11 +48,11 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
   }
 }
 
-#ifdef PADDLE_WITH_MKLDNN
-
-static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
-                                const platform::bfloat16 *x, int incx,
-                                platform::bfloat16 *y, int incy) {
+#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
+template <typename T>
+static void onednn_handler_axpy(int n, T alpha, const T *x, int incx, T *y,
+                                int incy) {
   // fallback to naive version
   if (n < 100) {
     axpy(n, alpha, x, incx, y, incy);
@@ -69,8 +71,8 @@ static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
       dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(cpu_place));
   auto &cpu_engine = dev_ctx->GetEngine();
 
-  platform::AXPYMKLDNNHandler<platform::bfloat16> handler(
-      *dev_ctx, cpu_engine, cpu_place, n, static_cast<float>(alpha));
+  platform::AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
+                                         static_cast<float>(alpha));
 
   auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
   auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
   auto reorder_p =
@@ -103,7 +105,8 @@ template <>
 struct CBlas<platform::bfloat16> {
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-#ifdef PADDLE_WITH_MKLDNN
+#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
     detail::onednn_handler_axpy(args...);
 #else
     detail::axpy(args...);

From 6c28aa8e9d61de124d5505754f41fb01f416763d Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:13:37 +0200
Subject: [PATCH 06/13] Move AXPY handler code to separate files.
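
After this move, callers only see the free function declared in
operators/mkldnn/axpy_handler.h; the handler class becomes an implementation
detail of axpy_handler.cc. A hedged usage sketch (buffers and sizes are
invented for illustration):

    #include <vector>
    #include "paddle/fluid/operators/mkldnn/axpy_handler.h"

    void example() {
      std::vector<float> x(512, 1.0f);
      std::vector<float> y(512, 2.0f);
      // y = 3 * x + y; with n >= 100 this takes the oneDNN reorder path,
      // smaller tensors fall back to the naive loop.
      paddle::operators::onednn_handler_axpy<float>(
          static_cast<int>(x.size()), 3.0f, x.data(), y.data());
    }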
---
 paddle/fluid/operators/CMakeLists.txt         |   3 +
 paddle/fluid/operators/math/CMakeLists.txt    |   8 +-
 paddle/fluid/operators/math/blas_impl.h       |  51 -----
 .../operators/math/selected_rows_functor.cc   |   8 +
 paddle/fluid/operators/mkldnn/CMakeLists.txt  |   1 +
 paddle/fluid/operators/mkldnn/axpy_handler.cc | 174 ++++++++++++++++++
 paddle/fluid/operators/mkldnn/axpy_handler.h  |  33 ++++
 paddle/fluid/platform/mkldnn/axpy_handler.h   |  92 ---------
 8 files changed, 226 insertions(+), 149 deletions(-)
 create mode 100644 paddle/fluid/operators/mkldnn/CMakeLists.txt
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.cc
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.h
 delete mode 100644 paddle/fluid/platform/mkldnn/axpy_handler.h

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e645b379f3c06..5cfc0144c6bdf 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -20,6 +20,9 @@ add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
 add_subdirectory(jit)
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
+endif()
 
 if(WITH_DISTRIBUTE)

diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index fdbc0c68525ba..a13fffe15cf24 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -56,7 +56,13 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-math_library(selected_rows_functor DEPS selected_rows math_function blas)
+
+if(WITH_MKLDNN)
+  math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler)
+else()
+  math_library(selected_rows_functor DEPS selected_rows math_function blas)
+endif()
+
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 89f1c2df6c691..e2aebc89bec90 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -21,13 +21,6 @@
 #include
 #include
 
-#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
-    !defined(PADDLE_WITH_HIP)
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/mkldnn/axpy_handler.h"
-#include "paddle/fluid/platform/profiler.h"
-#endif
-
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
@@ -46,11 +41,6 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
   }
 }
 
-#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
-    !defined(PADDLE_WITH_HIP)
-template <typename T>
-static void onednn_handler_axpy(int n, T alpha, const T *x, int incx, T *y,
-                                int incy) {
-  // fallback to naive version
-  if (n < 100) {
-    axpy(n, alpha, x, incx, y, incy);
-    return;
-  }
-
-  // TODO(jczaja): support increment values other than 1
-  PADDLE_ENFORCE_EQ(incx, 1, platform::errors::Unimplemented(
-                                 "Blas AXPY supports incx == 1 only"));
-  PADDLE_ENFORCE_EQ(incy, 1, platform::errors::Unimplemented(
-                                 "Blas AXPY supports incy == 1 only"));
-
-  auto &pool = platform::DeviceContextPool::Instance();
-  auto cpu_place = platform::CPUPlace();
-  auto *dev_ctx =
-      dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(cpu_place));
-  auto &cpu_engine = dev_ctx->GetEngine();
-
-  platform::AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
-                                         static_cast<float>(alpha));
-
-  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
-  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
-  auto reorder_p =
-      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
-
-  auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
-  platform::RecordEvent record_reorder("axpy_int_reorder",
-                                       platform::EventRole::kUniqueOp);
-  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
-  astream.wait();
-}
-#endif
-
 }  // namespace detail
 
 template <typename T>
@@ -105,7 +59,7 @@ template <>
 struct CBlas<platform::bfloat16> {
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
-    !defined(PADDLE_WITH_HIP)
-    detail::onednn_handler_axpy(args...);
-#else
     detail::axpy(args...);
-#endif
   }
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index ee405be5ae9a6..59502fa41cfa9 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#endif
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -302,7 +306,11 @@ typename std::enable_if<std::is_floating_point<T>::value ||
                         std::is_same<T, platform::complex<float>>::value ||
                         std::is_same<T, platform::complex<double>>::value>::type
 elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
                    const T* in, T* out) {
+#ifdef PADDLE_WITH_MKLDNN
+  onednn_handler_axpy(data_len, T(1.f), in, out);
+#else
   blas->AXPY(data_len, T(1.f), in, out);
+#endif
 }
 
 template <typename T>

diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt
new file mode 100644
index 0000000000000..ce95ec560c25e
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce)

diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc
new file mode 100644
index 0000000000000..a504f7f2c366a
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc
@@ -0,0 +1,174 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include
+#include
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+namespace plat = paddle::platform;
+
+namespace {
+
+template <typename T>
+class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx,
+                    const dnnl::engine mkldnn_engine, plat::Place cpu_place,
+                    int n, float alpha)
+      : plat::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            plat::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                            plat::MKLDNNGetDataType<T>(), alpha, "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  std::shared_ptr<dnnl::memory> AcquireMemory(void *ptr,
+                                              const std::string &suffix) {
+    /*Generate key*/
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType<T>(),
+                                   dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<dnnl::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const T *x) {
+    return this->AcquireMemory(plat::to_void_cast<T>(x), "@user_src_mem_p");
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireDstMemory(T *y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<dnnl::reorder> AcquireReorder(
+      std::shared_ptr<dnnl::memory> dst_memory_p,
+      std::shared_ptr<dnnl::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      dnnl::primitive_attr reorder_attr;
+      dnnl::post_ops post_operations;
+      if (this->alpha_ != 1.f) {
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      post_operations.append_sum(1.0f);
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<dnnl::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;
+  int n_;
+};
+
+template class AXPYMKLDNNHandler<float>;
+template class AXPYMKLDNNHandler<plat::bfloat16>;
+
+}  // anonymous namespace
+
+template <typename T>
+static void naive_axpy(int n, T alpha, const T *x, T *y) {
+  while (n-- > 0) {
+    *y += alpha * *x;
+    ++y;
+    ++x;
+  }
+}
+
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
+  // fallback to naive version
+  if (n < 100) {
+    naive_axpy(n, alpha, x, y);
+    return;
+  }
+
+  auto &pool = plat::DeviceContextPool::Instance();
+  auto cpu_place = plat::CPUPlace();
+  auto *dev_ctx =
+      dynamic_cast<plat::MKLDNNDeviceContext *>(pool.Get(cpu_place));
+  auto &cpu_engine = dev_ctx->GetEngine();
+
+  AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
+                               static_cast<float>(alpha));
+
+  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
+  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
+  auto reorder_p =
+      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+  auto &astream = plat::MKLDNNDeviceContext::tls().get_stream();
+  plat::RecordEvent record_reorder("axpy_int_reorder",
+                                   plat::EventRole::kUniqueOp);
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+}
+
+template <>
+void onednn_handler_axpy<double>(int, double, const double *, double *) {
+  PADDLE_THROW(plat::errors::Unavailable(
+      "Currently oneDNN library doesn't support double data type."));
+}
+
+template <>
+void onednn_handler_axpy<plat::complex<float>>(int, plat::complex<float>,
+                                               const plat::complex<float> *,
+                                               plat::complex<float> *) {
+  PADDLE_THROW(plat::errors::Unavailable(
+      "Currently oneDNN library doesn't support complex data type."));
+}
+
+template <>
+void onednn_handler_axpy<plat::complex<double>>(int, plat::complex<double>,
+                                                const plat::complex<double> *,
+                                                plat::complex<double> *) {
+  PADDLE_THROW(plat::errors::Unavailable(
+      "Currently oneDNN library doesn't support complex data type."));
+}
+
+template void onednn_handler_axpy<float>(int, float, const float *, float *);
+template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
+                                                  const plat::bfloat16 *,
+                                                  plat::bfloat16 *);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h
new file mode 100644
index 0000000000000..8f0fdeb5c02b4
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+namespace paddle {
+namespace operators {
+
+///
+/// @brief Helper function to execute AXPY using oneDNN.
+///
+/// @param[in] n     The number of elements in tensor (assumed 1-D).
+/// @param[in] alpha The alpha coefficient.
+/// @param[in] x     The pointer to input X tensor.
+/// @param     y     The pointer to output Y tensor.
+///
+/// @tparam    T     Data type.
+///
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/mkldnn/axpy_handler.h b/paddle/fluid/platform/mkldnn/axpy_handler.h
deleted file mode 100644
index 4aeb712d02191..0000000000000
--- a/paddle/fluid/platform/mkldnn/axpy_handler.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace platform {
-
-template <typename T>
-class AXPYMKLDNNHandler : public MKLDNNHandlerT<T, dnnl::reorder> {
- public:
-  AXPYMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
-                    const mkldnn::engine mkldnn_engine, Place cpu_place, int n,
-                    float alpha)
-      : MKLDNNHandlerT<T, dnnl::reorder>(
-            dev_ctx, mkldnn_engine, cpu_place,
-            CreateKey(dev_ctx, static_cast<int64_t>(n), MKLDNNGetDataType<T>(),
-                      alpha, "-axpy")),
-        alpha_(alpha),
-        n_(n) {}
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(void* ptr,
-                                                const std::string& suffix) {
-    /*Generate key*/
-    auto local_key = this->key_ + suffix;
-    auto mem_p = std::static_pointer_cast<mkldnn::memory>(
-        this->dev_ctx_.GetBlob(local_key));
-    if (mem_p == nullptr) {
-      auto md = mkldnn::memory::desc({n_}, MKLDNNGetDataType<T>(),
-                                     dnnl::memory::format_tag::x);
-      mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
-      this->dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-    }
-    return mem_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
-    return this->AcquireMemory(to_void_cast<T>(x), "@user_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(T* y) {
-    return this->AcquireMemory(y, "@user_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::reorder> AcquireReorder(
-      std::shared_ptr<mkldnn::memory> dst_memory_p,
-      std::shared_ptr<mkldnn::memory> src_memory_p) {
-    auto prim_key = this->key_ + "@reorder_p";
-    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        this->dev_ctx_.GetBlob(prim_key));
-    if (reorder_p == nullptr) {
-      // Here we pass post-ops to mimic y -> a*X + y
-      mkldnn::primitive_attr reorder_attr;
-      mkldnn::post_ops post_operations;
-      if (this->alpha_ != 1.f) {
-        std::vector<float> scales(1, this->alpha_);
-        reorder_attr.set_output_scales(0, scales);
-      }
-      post_operations.append_sum(1.0f);
-
-      reorder_attr.set_post_ops(post_operations);
-      reorder_p = std::make_shared<mkldnn::reorder>(
-          *(src_memory_p), *(dst_memory_p), reorder_attr);
-      this->dev_ctx_.SetBlob(prim_key, reorder_p);
-    }
-    return reorder_p;
-  }
-
- private:
-  float alpha_;
-  int n_;
-};
-
-}  // namespace platform
-}  // namespace paddle

From 1f0677bef13952dfc9b64230169da300aa293fe5 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:39:51 +0200
Subject: [PATCH 07/13] Use oneDNN AXPY handler in SGD op.

---
 paddle/fluid/operators/optimizers/sgd_op.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 076121c0e27da..c175d0c78a1e1 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
 #include "paddle/fluid/platform/bfloat16.h"
 
 namespace paddle {
@@ -139,9 +140,9 @@ struct sgd_dense_param_kernel<
                           "Got [%s], but expected less than [%s]",
                           grad_rows[i], grad_height));
       const int64_t row = grad_rows[i];
-      for (int64_t j = 0; j < grad_width; ++j) {
-        out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j];
-      }
+      operators::onednn_handler_axpy(grad_width, -lr[0],
+                                     grad_data + i * grad_width,
+                                     out_data + row * grad_width);
     }
   }
 };

From c022f9ca0bddc935e8d553a19c50159d022c2cb0 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:46:14 +0200
Subject: [PATCH 08/13] Use axpy handler only when Paddle is built with oneDNN.

---
 paddle/fluid/operators/optimizers/sgd_op.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index c175d0c78a1e1..076afdc655386 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/jit/kernels.h"
+#ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#endif
 #include "paddle/fluid/platform/bfloat16.h"
 
 namespace paddle {
@@ -140,9 +142,15 @@ struct sgd_dense_param_kernel<
                           "Got [%s], but expected less than [%s]",
                           grad_rows[i], grad_height));
       const int64_t row = grad_rows[i];
+#ifdef PADDLE_WITH_MKLDNN
       operators::onednn_handler_axpy(grad_width, -lr[0],
                                      grad_data + i * grad_width,
                                      out_data + row * grad_width);
+#else
+      for (int64_t j = 0; j < grad_width; ++j) {
+        out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j];
+      }
+#endif
     }
   }
 };

From 5b6d418c2f7c56325cfe587556945ac91b622778 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:50:49 +0200
Subject: [PATCH 09/13] Add test for SUM BF16 with big rows.

---
 python/paddle/fluid/tests/unittests/test_sum_op.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index f9e40cf8133d7..6e23c26a3a796 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -211,6 +211,13 @@ def test_w_is_selected_rows(self):
             self.check_with_place(core.CPUPlace(), inplace)
 
 
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op):
+    def init_kernel_type(self):
+        self.row_numel = 102
+
+
 class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
     def setUp(self):
         self.height = 10

From 274cdb1ae9ca19dcdee8d2dafa6c0d67240f6154 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 16:31:15 +0200
Subject: [PATCH 10/13] Fix SFINAE rules for elementwise_add_to.
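
The dispatch shape can be checked in isolation; a minimal self-contained
sketch of the same enable_if pattern (bf16 is a stand-in type, not the real
platform::bfloat16):

    #include <cstdio>
    #include <type_traits>

    struct bf16 {};  // stand-in for platform::bfloat16

    // Selected for the types the oneDNN handler accepts.
    template <typename T>
    typename std::enable_if<std::is_same<T, float>::value ||
                            std::is_same<T, bf16>::value>::type
    add_to(const T *, T *) { std::puts("oneDNN path"); }

    // Selected for everything routed to BLAS AXPY.
    template <typename T>
    typename std::enable_if<std::is_same<T, double>::value>::type
    add_to(const T *, T *) { std::puts("BLAS path"); }

    int main() {
      float xf{}, yf{};
      double xd{}, yd{};
      add_to(&xf, &yf);  // prints "oneDNN path"
      add_to(&xd, &yd);  // prints "BLAS path"
    }

Exactly one overload survives substitution for each T, so each call resolves
without ambiguity; that is the property this patch's two-overload split
restores.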
---
 .../operators/math/selected_rows_functor.cc   | 23 +++++++++++++++----
 paddle/fluid/operators/mkldnn/axpy_handler.cc | 22 ------------------
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 59502fa41cfa9..a72bdec05d77f 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -300,18 +300,33 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
 
+#ifdef PADDLE_WITH_MKLDNN
 template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value ||
+typename std::enable_if<std::is_same<T, float>::value ||
+                        std::is_same<T, platform::bfloat16>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
+  onednn_handler_axpy(data_len, T(1.f), in, out);
+}
+
+template <typename T>
+typename std::enable_if<std::is_same<T, double>::value ||
                         std::is_same<T, platform::complex<float>>::value ||
                         std::is_same<T, platform::complex<double>>::value>::type
 elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
                    const T* in, T* out) {
-#ifdef PADDLE_WITH_MKLDNN
-  onednn_handler_axpy(data_len, T(1.f), in, out);
+  blas->AXPY(data_len, T(1.f), in, out);
+}
 #else
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value ||
+                        std::is_same<T, platform::complex<float>>::value ||
+                        std::is_same<T, platform::complex<double>>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
   blas->AXPY(data_len, T(1.f), in, out);
-#endif
 }
+#endif
 
 template <typename T>
 typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(

diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc
index a504f7f2c366a..76101f19ab618 100644
--- a/paddle/fluid/operators/mkldnn/axpy_handler.cc
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc
@@ -143,28 +143,6 @@ void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
   astream.wait();
 }
 
-template <>
-void onednn_handler_axpy<double>(int, double, const double *, double *) {
-  PADDLE_THROW(plat::errors::Unavailable(
-      "Currently oneDNN library doesn't support double data type."));
-}
-
-template <>
-void onednn_handler_axpy<plat::complex<float>>(int, plat::complex<float>,
-                                               const plat::complex<float> *,
-                                               plat::complex<float> *) {
-  PADDLE_THROW(plat::errors::Unavailable(
-      "Currently oneDNN library doesn't support complex data type."));
-}
-
-template <>
-void onednn_handler_axpy<plat::complex<double>>(int, plat::complex<double>,
-                                                const plat::complex<double> *,
-                                                plat::complex<double> *) {
-  PADDLE_THROW(plat::errors::Unavailable(
-      "Currently oneDNN library doesn't support complex data type."));
-}
-
 template void onednn_handler_axpy<float>(int, float, const float *, float *);
 template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
                                                   const plat::bfloat16 *,
                                                   plat::bfloat16 *);

From 2dc3135d22d0a6b5c4e8d0d8a68eb1f20c528aba Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Fri, 11 Jun 2021 09:54:56 +0200
Subject: [PATCH 11/13] Add test case for SGD with big rows.
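
The new case matters because the per-row element count is what reaches the
n < 100 fallback check added in patch 02: with grad_row_numel = 120 each row
update actually exercises the oneDNN reorder path, while the earlier cases
(for example, 16 elements per row) only ever hit the naive loop. Roughly,
the branch being targeted (threshold as in axpy_handler.cc):

    // n == grad_row_numel for one sparse-row update
    if (n < 100) {
      naive_axpy(n, alpha, x, y);  // earlier test cases end here
      return;
    }
    // grad_row_numel = 120 continues into the oneDNN reorder code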
---
 python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index fa8ff4effcfd3..207bb087db9a9 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -158,6 +158,15 @@ def setup_params(self):
         self.grad_row_numel = 16
 
 
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16):
+    def setup_params(self):
+        self.grad_height = 10
+        self.grad_rows = [0, 4, 7]
+        self.grad_row_numel = 120
+
+
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):

From f37be244421b10c9b7789548ad961dd7c3afd2be Mon Sep 17 00:00:00 2001
From: lidanqing-intel
Date: Fri, 18 Jun 2021 04:25:50 +0200
Subject: [PATCH 12/13] update

---
 paddle/fluid/operators/math/blas_impl.h                 | 2 +-
 python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py | 2 --
 python/paddle/fluid/tests/unittests/test_sum_op.py      | 2 --
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index e2aebc89bec90..eab513e24bc80 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -40,7 +40,6 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
     x = x + incx;
   }
 }
-
 }  // namespace detail
 
 template <typename T>
@@ -61,6 +60,7 @@ struct CBlas<platform::bfloat16> {
   static void AXPY(ARGS... args) {
     detail::axpy(args...);
   }
+
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(

diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index 207bb087db9a9..bee45cce248f6 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -158,8 +158,6 @@ def setup_params(self):
         self.grad_row_numel = 16
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 'place does not support BF16 evaluation')
 class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16):
     def setup_params(self):
         self.grad_height = 10

diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 6e23c26a3a796..f0fbd143c5a77 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -211,8 +211,6 @@ def test_w_is_selected_rows(self):
             self.check_with_place(core.CPUPlace(), inplace)
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 'place does not support BF16 evaluation')
 class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op):
     def init_kernel_type(self):
         self.row_numel = 102

From 435154de26f947e92468c7c7a73a4e9c9f1f5916 Mon Sep 17 00:00:00 2001
From: lidanqing-intel
Date: Mon, 21 Jun 2021 02:19:47 +0200
Subject: [PATCH 13/13] update

---
 python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index bee45cce248f6..e60b04257dbbd 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -201,8 +201,6 @@ def test_sparse_param_grad_sgd(self):
         self.check_output(output, reference, atol=5e-3, rtol=1e-1)
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 'place does not support BF16 evaluation')
 class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16):
     def setup_params(self):
         self.grad_height = 14