From 2296053a3ac1b3a9dee33b6a83a91d18b042f76e Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Fri, 21 May 2021 12:07:35 +0200
Subject: [PATCH 01/13] Add oneDNN AXPY handler.

---
 paddle/fluid/operators/math/blas_impl.h     | 46 ++++++++++-
 paddle/fluid/platform/mkldnn/axpy_handler.h | 92 +++++++++++++++++++++
 2 files changed, 136 insertions(+), 2 deletions(-)
 create mode 100644 paddle/fluid/platform/mkldnn/axpy_handler.h

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 05d42f02c1003..5c2ae63ccf9da 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -21,6 +21,11 @@
 #include
 #include
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn/axpy_handler.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex128.h"
@@ -41,6 +46,40 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
     x = x + incx;
   }
 }
+
+#ifdef PADDLE_WITH_MKLDNN
+
+static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
+                                const platform::bfloat16 *x, int incx,
+                                platform::bfloat16 *y, int incy) {
+  // TODO(jczaja): support increment values other than 1
+  PADDLE_ENFORCE_EQ(incx, 1, platform::errors::Unimplemented(
+                                 "Blas AXPY supports incx == 1 only"));
+  PADDLE_ENFORCE_EQ(incy, 1, platform::errors::Unimplemented(
+                                 "Blas AXPY supports incy == 1 only"));
+
+  auto &pool = platform::DeviceContextPool::Instance();
+  auto cpu_place = platform::CPUPlace();
+  auto *dev_ctx =
+      dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(cpu_place));
+  auto &cpu_engine = dev_ctx->GetEngine();
+
+  platform::AXPYMKLDNNHandler<platform::bfloat16> handler(
+      *dev_ctx, cpu_engine, cpu_place, n, static_cast<float>(alpha));
+
+  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
+  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
+  auto reorder_p =
+      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+  auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
+  platform::RecordEvent record_reorder("axpy_int_reorder",
+                                       platform::EventRole::kUniqueOp);
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+}
+#endif
+
 }  // namespace detail
 
 template <typename T>
 struct CBlas;
 
@@ -57,11 +96,14 @@ template <>
 struct CBlas<platform::bfloat16> {
+#ifdef PADDLE_WITH_MKLDNN
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-    detail::axpy(args...);
+    detail::onednn_handler_axpy(args...);
+#else
+    detail::axpy(args...);
+#endif
   }
-
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/platform/mkldnn/axpy_handler.h b/paddle/fluid/platform/mkldnn/axpy_handler.h
new file mode 100644
index 0000000000000..d970c571c8318
--- /dev/null
+++ b/paddle/fluid/platform/mkldnn/axpy_handler.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace platform {
+
+template <typename T>
+class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  AXPYMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
+                    const mkldnn::engine mkldnn_engine,
+                    platform::Place cpu_place, int n, float alpha)
+      : platform::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            platform::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                                platform::MKLDNNGetDataType<T>(), alpha,
+                                "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  std::shared_ptr<mkldnn::memory> AcquireMemory(void* ptr,
+                                                const std::string& suffix) {
+    /*Generate key*/
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<mkldnn::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto md = mkldnn::memory::desc({n_}, platform::MKLDNNGetDataType<T>(),
+                                     dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
+    return this->AcquireMemory(platform::to_void_cast<T>(x), "@user_src_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(T* y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<mkldnn::reorder> AcquireReorder(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      mkldnn::primitive_attr reorder_attr;
+      mkldnn::post_ops post_operations;
+      if (this->alpha_ != 1.f) {
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      post_operations.append_sum(1.0f);
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<mkldnn::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;
+  int n_;
+};
+
+}  // namespace platform
+}  // namespace paddle

From 3b7b23a7f7ffe5b05e5083340d188362263f4384 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Wed, 9 Jun 2021 19:04:39 +0200
Subject: [PATCH 02/13] Add fallback for small tensors.
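
The AXPYMKLDNNHandler introduced in the previous patch realizes y = a*X + y
with a single oneDNN reorder: the output scale multiplies the source by
alpha, and the sum post-op accumulates the scaled source into the existing
destination. The same trick can be pictured outside Paddle; a minimal
standalone sketch, assuming the oneDNN 2.x C++ API (set_output_scales was
removed in oneDNN 3.0) and a hypothetical function name:

    #include <vector>
    #include "dnnl.hpp"

    // y = alpha * x + y for contiguous f32 data, done by one reorder.
    void axpy_via_reorder(int n, float alpha, const float *x, float *y) {
      dnnl::engine eng(dnnl::engine::kind::cpu, 0);
      dnnl::stream stream(eng);

      auto md = dnnl::memory::desc({n}, dnnl::memory::data_type::f32,
                                   dnnl::memory::format_tag::x);
      dnnl::memory src(md, eng, const_cast<float *>(x));
      dnnl::memory dst(md, eng, y);

      dnnl::primitive_attr attr;
      attr.set_output_scales(0, {alpha});  // scale the source by alpha
      dnnl::post_ops ops;
      ops.append_sum(1.0f);                // then add the existing dst values
      attr.set_post_ops(ops);

      dnnl::reorder(src, dst, attr).execute(stream, src, dst);
      stream.wait();
    }

Creating an engine, stream, and primitive per call has a fixed cost, which
is why very small tensors are better served by a plain loop; hence this
patch's fallback.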
---
 paddle/fluid/operators/math/blas_impl.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 693f17cf3d535..75005881664ec 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -51,6 +51,12 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
 static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
                                 const platform::bfloat16 *x, int incx,
                                 platform::bfloat16 *y, int incy) {
+  // fallback to naive version
+  if (n < 100) {
+    axpy(n, alpha, x, incx, y, incy);
+    return;
+  }
+
   // TODO(jczaja): support increment values other than 1
   PADDLE_ENFORCE_EQ(incx, 1, platform::errors::Unimplemented(
                                  "Blas AXPY supports incx == 1 only"));
   PADDLE_ENFORCE_EQ(incy, 1, platform::errors::Unimplemented(
                                  "Blas AXPY supports incy == 1 only"));

From b2d6dfefd02231e3ff7a8c3f5936cecf0809a885 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Wed, 9 Jun 2021 19:05:17 +0200
Subject: [PATCH 03/13] Fix ifdefs

---
 paddle/fluid/operators/math/blas_impl.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 75005881664ec..f713cdf21b898 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -101,12 +101,12 @@
 template <>
 struct CBlas<platform::bfloat16> {
-#ifdef PADDLE_WITH_MKLDNN
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
+#ifdef PADDLE_WITH_MKLDNN
     detail::onednn_handler_axpy(args...);
 #else
-    detail::axpy(args...);
+    detail::axpy(args...);
 #endif
   }
   template <typename... ARGS>

From 425a76b0c91b98388a32b3510dc9a5159d3f64e9 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 10:37:12 +0200
Subject: [PATCH 04/13] Remove unnecessary namespace prefixes and add missing
 headers.

---
 paddle/fluid/platform/mkldnn/axpy_handler.h | 25 ++++++++++++---------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/platform/mkldnn/axpy_handler.h b/paddle/fluid/platform/mkldnn/axpy_handler.h
index d970c571c8318..4aeb712d02191 100644
--- a/paddle/fluid/platform/mkldnn/axpy_handler.h
+++ b/paddle/fluid/platform/mkldnn/axpy_handler.h
@@ -13,24 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
+#include
+#include
+#include
+#include
+
 #include "mkldnn.hpp"
 #include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
 namespace paddle {
 namespace platform {
 
 template <typename T>
-class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
+class AXPYMKLDNNHandler : public MKLDNNHandlerT<T, dnnl::reorder> {
  public:
-  AXPYMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
-                    const mkldnn::engine mkldnn_engine,
-                    platform::Place cpu_place, int n, float alpha)
-      : platform::MKLDNNHandlerT<T, dnnl::reorder>(
+  AXPYMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+                    const mkldnn::engine mkldnn_engine, Place cpu_place, int n,
+                    float alpha)
+      : MKLDNNHandlerT<T, dnnl::reorder>(
             dev_ctx, mkldnn_engine, cpu_place,
-            platform::CreateKey(dev_ctx, static_cast<int64_t>(n),
-                                platform::MKLDNNGetDataType<T>(), alpha,
-                                "-axpy")),
+            CreateKey(dev_ctx, static_cast<int64_t>(n), MKLDNNGetDataType<T>(),
+                      alpha, "-axpy")),
         alpha_(alpha),
         n_(n) {}
 
@@ -41,7 +46,7 @@ class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
     auto mem_p = std::static_pointer_cast<mkldnn::memory>(
         this->dev_ctx_.GetBlob(local_key));
     if (mem_p == nullptr) {
-      auto md = mkldnn::memory::desc({n_}, platform::MKLDNNGetDataType<T>(),
+      auto md = mkldnn::memory::desc({n_}, MKLDNNGetDataType<T>(),
                                      dnnl::memory::format_tag::x);
       mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
       this->dev_ctx_.SetBlob(local_key, mem_p);
@@ -52,7 +57,7 @@ class AXPYMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::reorder> {
   }
 
   std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
-    return this->AcquireMemory(platform::to_void_cast<T>(x), "@user_src_mem_p");
+    return this->AcquireMemory(to_void_cast<T>(x), "@user_src_mem_p");
   }

From 62b1ada8c4729c7d60e89218f4dc0b307ee9be25 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 10:39:04 +0200
Subject: [PATCH 05/13] Guard handler_axpy with proper ifdefs.

* Compilation of this function is possible only when Paddle is not built
  with CUDA or HIP.
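
The plain PADDLE_WITH_MKLDNN check is not sufficient here, presumably
because blas_impl.h is also seen by the CUDA/HIP compilation passes. If the
triple condition were needed in more places, it could be folded into one
helper macro; a sketch only, with a hypothetical macro name that is not part
of this patch:

    #if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
        !defined(PADDLE_WITH_HIP)
    #define PADDLE_CPU_ONLY_ONEDNN_AXPY 1  // oneDNN AXPY usable in this build
    #endif

    // At a use site:
    #ifdef PADDLE_CPU_ONLY_ONEDNN_AXPY
        detail::onednn_handler_axpy(args...);
    #else
        detail::axpy(args...);
    #endif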
---
 paddle/fluid/operators/math/blas_impl.h | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index f713cdf21b898..89f1c2df6c691 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -21,9 +21,11 @@
 #include
 #include
 
-#ifdef PADDLE_WITH_MKLDNN
+#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/mkldnn/axpy_handler.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/profiler.h"
 #endif
 
 #include "paddle/fluid/operators/math/math_function.h"
@@ -46,11 +48,11 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
   }
 }
 
-#ifdef PADDLE_WITH_MKLDNN
-
-static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
-                                const platform::bfloat16 *x, int incx,
-                                platform::bfloat16 *y, int incy) {
+#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
+template <typename T>
+static void onednn_handler_axpy(int n, T alpha, const T *x, int incx, T *y,
+                                int incy) {
   // fallback to naive version
   if (n < 100) {
     axpy(n, alpha, x, incx, y, incy);
@@ -69,8 +71,8 @@ static void onednn_handler_axpy(int n, platform::bfloat16 alpha,
       dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(cpu_place));
   auto &cpu_engine = dev_ctx->GetEngine();
 
-  platform::AXPYMKLDNNHandler<platform::bfloat16> handler(
-      *dev_ctx, cpu_engine, cpu_place, n, static_cast<float>(alpha));
+  platform::AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
+                                         static_cast<float>(alpha));
 
   auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
   auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
   auto reorder_p =
@@ -103,7 +105,8 @@ template <>
 struct CBlas<platform::bfloat16> {
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-#ifdef PADDLE_WITH_MKLDNN
+#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
+    !defined(PADDLE_WITH_HIP)
     detail::onednn_handler_axpy(args...);
 #else
     detail::axpy(args...);

From 6c28aa8e9d61de124d5505754f41fb01f416763d Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:13:37 +0200
Subject: [PATCH 06/13] Move AXPY handler code to separate files.
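
After this move, callers only see the free function declared in
operators/mkldnn/axpy_handler.h; the handler class becomes an implementation
detail of axpy_handler.cc. A hedged usage sketch (buffers and sizes are
invented for illustration):

    #include <vector>
    #include "paddle/fluid/operators/mkldnn/axpy_handler.h"

    void example() {
      std::vector<float> x(512, 1.0f);
      std::vector<float> y(512, 2.0f);
      // y = 3 * x + y; with n >= 100 this takes the oneDNN reorder path,
      // smaller tensors fall back to the naive loop.
      paddle::operators::onednn_handler_axpy<float>(
          static_cast<int>(x.size()), 3.0f, x.data(), y.data());
    }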
---
 paddle/fluid/operators/CMakeLists.txt         |   3 +
 paddle/fluid/operators/math/CMakeLists.txt    |   8 +-
 paddle/fluid/operators/math/blas_impl.h       |  51 -----
 .../operators/math/selected_rows_functor.cc   |   8 +
 paddle/fluid/operators/mkldnn/CMakeLists.txt  |   1 +
 paddle/fluid/operators/mkldnn/axpy_handler.cc | 174 ++++++++++++++++++
 paddle/fluid/operators/mkldnn/axpy_handler.h  |  33 ++++
 paddle/fluid/platform/mkldnn/axpy_handler.h   |  92 ---------
 8 files changed, 226 insertions(+), 149 deletions(-)
 create mode 100644 paddle/fluid/operators/mkldnn/CMakeLists.txt
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.cc
 create mode 100644 paddle/fluid/operators/mkldnn/axpy_handler.h
 delete mode 100644 paddle/fluid/platform/mkldnn/axpy_handler.h

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e645b379f3c06..5cfc0144c6bdf 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -20,6 +20,9 @@ add_subdirectory(optimizers)
 add_subdirectory(reduce_ops)
 add_subdirectory(sequence_ops)
 add_subdirectory(jit)
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
+endif()
 
 if(WITH_DISTRIBUTE)

diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index fdbc0c68525ba..a13fffe15cf24 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -56,7 +56,13 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
 math_library(maxouting)
 math_library(pooling)
-math_library(selected_rows_functor DEPS selected_rows math_function blas)
+
+if(WITH_MKLDNN)
+  math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler)
+else()
+  math_library(selected_rows_functor DEPS selected_rows math_function blas)
+endif()
+
 math_library(sequence2batch)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 89f1c2df6c691..e2aebc89bec90 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -21,13 +21,6 @@
 #include
 #include
 
-#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
-    !defined(PADDLE_WITH_HIP)
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/mkldnn/axpy_handler.h"
-#include "paddle/fluid/platform/profiler.h"
-#endif
-
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
@@ -46,11 +41,6 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
   }
 }
 
-#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
-    !defined(PADDLE_WITH_HIP)
-template <typename T>
-static void onednn_handler_axpy(int n, T alpha, const T *x, int incx, T *y,
-                                int incy) {
-  // fallback to naive version
-  if (n < 100) {
-    axpy(n, alpha, x, incx, y, incy);
-    return;
-  }
-
-  // TODO(jczaja): support increment values other than 1
-  PADDLE_ENFORCE_EQ(incx, 1, platform::errors::Unimplemented(
-                                 "Blas AXPY supports incx == 1 only"));
-  PADDLE_ENFORCE_EQ(incy, 1, platform::errors::Unimplemented(
-                                 "Blas AXPY supports incy == 1 only"));
-
-  auto &pool = platform::DeviceContextPool::Instance();
-  auto cpu_place = platform::CPUPlace();
-  auto *dev_ctx =
-      dynamic_cast<platform::MKLDNNDeviceContext *>(pool.Get(cpu_place));
-  auto &cpu_engine = dev_ctx->GetEngine();
-
-  platform::AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
-                                         static_cast<float>(alpha));
-
-  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
-  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
-  auto reorder_p =
-      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
-
-  auto &astream = platform::MKLDNNDeviceContext::tls().get_stream();
-  platform::RecordEvent record_reorder("axpy_int_reorder",
-                                       platform::EventRole::kUniqueOp);
-  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
-  astream.wait();
-}
-#endif
-
 }  // namespace detail
 
 template <typename T>
@@ -105,7 +59,7 @@ template <>
 struct CBlas<platform::bfloat16> {
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
-#if defined(PADDLE_WITH_MKLDNN) && !defined(PADDLE_WITH_CUDA) && \
-    !defined(PADDLE_WITH_HIP)
-    detail::onednn_handler_axpy(args...);
-#else
     detail::axpy(args...);
-#endif
   }
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index ee405be5ae9a6..59502fa41cfa9 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#endif
+
 namespace paddle {
 namespace operators {
 namespace math {
@@ -302,7 +306,11 @@ typename std::enable_if<std::is_floating_point<T>::value ||
                         std::is_same<T, platform::complex<float>>::value ||
                         std::is_same<T, platform::complex<double>>::value>::type
 elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
                    const T* in, T* out) {
+#ifdef PADDLE_WITH_MKLDNN
+  onednn_handler_axpy(data_len, T(1.f), in, out);
+#else
   blas->AXPY(data_len, T(1.f), in, out);
+#endif
 }
 
 template <typename T>

diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt
new file mode 100644
index 0000000000000..ce95ec560c25e
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce)

diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc
new file mode 100644
index 0000000000000..a504f7f2c366a
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc
@@ -0,0 +1,174 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include
+#include
+
+#include "mkldnn.hpp"
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+namespace plat = paddle::platform;
+
+namespace {
+
+template <typename T>
+class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT<T, dnnl::reorder> {
+ public:
+  AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx,
+                    const dnnl::engine mkldnn_engine, plat::Place cpu_place,
+                    int n, float alpha)
+      : plat::MKLDNNHandlerT<T, dnnl::reorder>(
+            dev_ctx, mkldnn_engine, cpu_place,
+            plat::CreateKey(dev_ctx, static_cast<int64_t>(n),
+                            plat::MKLDNNGetDataType<T>(), alpha, "-axpy")),
+        alpha_(alpha),
+        n_(n) {}
+
+  std::shared_ptr<dnnl::memory> AcquireMemory(void *ptr,
+                                              const std::string &suffix) {
+    /*Generate key*/
+    auto local_key = this->key_ + suffix;
+    auto mem_p = std::static_pointer_cast<dnnl::memory>(
+        this->dev_ctx_.GetBlob(local_key));
+    if (mem_p == nullptr) {
+      auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType<T>(),
+                                   dnnl::memory::format_tag::x);
+      mem_p = std::make_shared<dnnl::memory>(md, this->engine_, ptr);
+      this->dev_ctx_.SetBlob(local_key, mem_p);
+    } else {
+      mem_p->set_data_handle(ptr);
+    }
+    return mem_p;
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireSrcMemory(const T *x) {
+    return this->AcquireMemory(plat::to_void_cast<T>(x), "@user_src_mem_p");
+  }
+
+  std::shared_ptr<dnnl::memory> AcquireDstMemory(T *y) {
+    return this->AcquireMemory(y, "@user_dst_mem_p");
+  }
+
+  std::shared_ptr<dnnl::reorder> AcquireReorder(
+      std::shared_ptr<dnnl::memory> dst_memory_p,
+      std::shared_ptr<dnnl::memory> src_memory_p) {
+    auto prim_key = this->key_ + "@reorder_p";
+    auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
+        this->dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      // Here we pass post-ops to mimic y -> a*X + y
+      dnnl::primitive_attr reorder_attr;
+      dnnl::post_ops post_operations;
+      if (this->alpha_ != 1.f) {
+        std::vector<float> scales(1, this->alpha_);
+        reorder_attr.set_output_scales(0, scales);
+      }
+      post_operations.append_sum(1.0f);
+
+      reorder_attr.set_post_ops(post_operations);
+      reorder_p = std::make_shared<dnnl::reorder>(
+          *(src_memory_p), *(dst_memory_p), reorder_attr);
+      this->dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
+ private:
+  float alpha_;
+  int n_;
+};
+
+template class AXPYMKLDNNHandler<float>;
+template class AXPYMKLDNNHandler<plat::bfloat16>;
+
+}  // anonymous namespace
+
+template <typename T>
+static void naive_axpy(int n, T alpha, const T *x, T *y) {
+  while (n-- > 0) {
+    *y += alpha * *x;
+    ++y;
+    ++x;
+  }
+}
+
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
+  // fallback to naive version
+  if (n < 100) {
+    naive_axpy(n, alpha, x, y);
+    return;
+  }
+
+  auto &pool = plat::DeviceContextPool::Instance();
+  auto cpu_place = plat::CPUPlace();
+  auto *dev_ctx =
+      dynamic_cast<plat::MKLDNNDeviceContext *>(pool.Get(cpu_place));
+  auto &cpu_engine = dev_ctx->GetEngine();
+
+  AXPYMKLDNNHandler<T> handler(*dev_ctx, cpu_engine, cpu_place, n,
+                               static_cast<float>(alpha));
+
+  auto reorder_src_memory_p = handler.AcquireSrcMemory(x);
+  auto reorder_dst_memory_p = handler.AcquireDstMemory(y);
+  auto reorder_p =
+      handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
+
+  auto &astream = plat::MKLDNNDeviceContext::tls().get_stream();
+  plat::RecordEvent record_reorder("axpy_int_reorder",
+                                   plat::EventRole::kUniqueOp);
+  reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
+  astream.wait();
+}
+
+template <>
+void onednn_handler_axpy<double>(int, double, const double *, double *) {
+  PADDLE_THROW(plat::errors::Unavailable(
+      "Currently oneDNN library doesn't support double data type."));
+}
+
+template <>
+void onednn_handler_axpy<plat::complex<float>>(int, plat::complex<float>,
+                                               const plat::complex<float> *,
+                                               plat::complex<float> *) {
+  PADDLE_THROW(plat::errors::Unavailable(
+      "Currently oneDNN library doesn't support complex data type."));
+}
+
+template <>
+void onednn_handler_axpy<plat::complex<double>>(int, plat::complex<double>,
+                                                const plat::complex<double> *,
+                                                plat::complex<double> *) {
+  PADDLE_THROW(plat::errors::Unavailable(
+      "Currently oneDNN library doesn't support complex data type."));
+}
+
+template void onednn_handler_axpy<float>(int, float, const float *, float *);
+template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
+                                                  const plat::bfloat16 *,
+                                                  plat::bfloat16 *);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h
new file mode 100644
index 0000000000000..8f0fdeb5c02b4
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+namespace paddle {
+namespace operators {
+
+///
+/// @brief Helper function to execute AXPY using oneDNN.
+///
+/// @param[in] n     The number of elements in tensor (assumed 1-D).
+/// @param[in] alpha The alpha coefficient.
+/// @param[in] x     The pointer to input X tensor.
+/// @param     y     The pointer to output Y tensor.
+///
+/// @tparam    T     Data type.
+///
+template <typename T>
+void onednn_handler_axpy(int n, T alpha, const T *x, T *y);
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/platform/mkldnn/axpy_handler.h b/paddle/fluid/platform/mkldnn/axpy_handler.h
deleted file mode 100644
index 4aeb712d02191..0000000000000
--- a/paddle/fluid/platform/mkldnn/axpy_handler.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include "mkldnn.hpp"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace platform {
-
-template <typename T>
-class AXPYMKLDNNHandler : public MKLDNNHandlerT<T, dnnl::reorder> {
- public:
-  AXPYMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
-                    const mkldnn::engine mkldnn_engine, Place cpu_place, int n,
-                    float alpha)
-      : MKLDNNHandlerT<T, dnnl::reorder>(
-            dev_ctx, mkldnn_engine, cpu_place,
-            CreateKey(dev_ctx, static_cast<int64_t>(n), MKLDNNGetDataType<T>(),
-                      alpha, "-axpy")),
-        alpha_(alpha),
-        n_(n) {}
-
-  std::shared_ptr<mkldnn::memory> AcquireMemory(void* ptr,
-                                                const std::string& suffix) {
-    /*Generate key*/
-    auto local_key = this->key_ + suffix;
-    auto mem_p = std::static_pointer_cast<mkldnn::memory>(
-        this->dev_ctx_.GetBlob(local_key));
-    if (mem_p == nullptr) {
-      auto md = mkldnn::memory::desc({n_}, MKLDNNGetDataType<T>(),
-                                     dnnl::memory::format_tag::x);
-      mem_p = std::make_shared<mkldnn::memory>(md, this->engine_, ptr);
-      this->dev_ctx_.SetBlob(local_key, mem_p);
-    } else {
-      mem_p->set_data_handle(ptr);
-    }
-    return mem_p;
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(const T* x) {
-    return this->AcquireMemory(to_void_cast<T>(x), "@user_src_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(T* y) {
-    return this->AcquireMemory(y, "@user_dst_mem_p");
-  }
-
-  std::shared_ptr<mkldnn::reorder> AcquireReorder(
-      std::shared_ptr<mkldnn::memory> dst_memory_p,
-      std::shared_ptr<mkldnn::memory> src_memory_p) {
-    auto prim_key = this->key_ + "@reorder_p";
-    auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
-        this->dev_ctx_.GetBlob(prim_key));
-    if (reorder_p == nullptr) {
-      // Here we pass post-ops to mimic y -> a*X + y
-      mkldnn::primitive_attr reorder_attr;
-      mkldnn::post_ops post_operations;
-      if (this->alpha_ != 1.f) {
-        std::vector<float> scales(1, this->alpha_);
-        reorder_attr.set_output_scales(0, scales);
-      }
-      post_operations.append_sum(1.0f);
-
-      reorder_attr.set_post_ops(post_operations);
-      reorder_p = std::make_shared<mkldnn::reorder>(
-          *(src_memory_p), *(dst_memory_p), reorder_attr);
-      this->dev_ctx_.SetBlob(prim_key, reorder_p);
-    }
-    return reorder_p;
-  }
-
- private:
-  float alpha_;
-  int n_;
-};
-
-}  // namespace platform
-}  // namespace paddle

From 1f0677bef13952dfc9b64230169da300aa293fe5 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:39:51 +0200
Subject: [PATCH 07/13] Use oneDNN AXPY handler in SGD op.

---
 paddle/fluid/operators/optimizers/sgd_op.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index 076121c0e27da..c175d0c78a1e1 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/jit/kernels.h"
+#include "paddle/fluid/operators/mkldnn/axpy_handler.h"
 #include "paddle/fluid/platform/bfloat16.h"
 
 namespace paddle {
@@ -139,9 +140,9 @@ struct sgd_dense_param_kernel<
                           "Got [%s], but expected less than [%s]",
                           grad_rows[i], grad_height));
       const int64_t row = grad_rows[i];
-      for (int64_t j = 0; j < grad_width; ++j) {
-        out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j];
-      }
+      operators::onednn_handler_axpy(grad_width, -lr[0],
+                                     grad_data + i * grad_width,
+                                     out_data + row * grad_width);
     }
   }
 };

From c022f9ca0bddc935e8d553a19c50159d022c2cb0 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:46:14 +0200
Subject: [PATCH 08/13] Use axpy handler only when Paddle is built with oneDNN.

---
 paddle/fluid/operators/optimizers/sgd_op.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h
index c175d0c78a1e1..076afdc655386 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.h
+++ b/paddle/fluid/operators/optimizers/sgd_op.h
@@ -19,7 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/var_type_traits.h"
 #include "paddle/fluid/operators/jit/kernels.h"
+#ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/operators/mkldnn/axpy_handler.h"
+#endif
 #include "paddle/fluid/platform/bfloat16.h"
 
 namespace paddle {
@@ -140,9 +142,15 @@ struct sgd_dense_param_kernel<
                           "Got [%s], but expected less than [%s]",
                           grad_rows[i], grad_height));
       const int64_t row = grad_rows[i];
+#ifdef PADDLE_WITH_MKLDNN
       operators::onednn_handler_axpy(grad_width, -lr[0],
                                      grad_data + i * grad_width,
                                      out_data + row * grad_width);
+#else
+      for (int64_t j = 0; j < grad_width; ++j) {
+        out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j];
+      }
+#endif
     }
   }
 };

From 5b6d418c2f7c56325cfe587556945ac91b622778 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 15:50:49 +0200
Subject: [PATCH 09/13] Add test for SUM BF16 with big rows.

---
 python/paddle/fluid/tests/unittests/test_sum_op.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index f9e40cf8133d7..6e23c26a3a796 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -211,6 +211,13 @@ def test_w_is_selected_rows(self):
             self.check_with_place(core.CPUPlace(), inplace)
 
 
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op):
+    def init_kernel_type(self):
+        self.row_numel = 102
+
+
 class TestLoDTensorAndSelectedRowsOp(TestSelectedRowsSumOp):
     def setUp(self):
         self.height = 10

From 274cdb1ae9ca19dcdee8d2dafa6c0d67240f6154 Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Thu, 10 Jun 2021 16:31:15 +0200
Subject: [PATCH 10/13] Fix SFINAE rules for elementwise_add_to.
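
The dispatch shape can be checked in isolation; a minimal self-contained
sketch of the same enable_if pattern (bf16 is a stand-in type, not the real
platform::bfloat16):

    #include <cstdio>
    #include <type_traits>

    struct bf16 {};  // stand-in for platform::bfloat16

    // Selected for the types the oneDNN handler accepts.
    template <typename T>
    typename std::enable_if<std::is_same<T, float>::value ||
                            std::is_same<T, bf16>::value>::type
    add_to(const T *, T *) { std::puts("oneDNN path"); }

    // Selected for everything routed to BLAS AXPY.
    template <typename T>
    typename std::enable_if<std::is_same<T, double>::value>::type
    add_to(const T *, T *) { std::puts("BLAS path"); }

    int main() {
      float xf{}, yf{};
      double xd{}, yd{};
      add_to(&xf, &yf);  // prints "oneDNN path"
      add_to(&xd, &yd);  // prints "BLAS path"
    }

Exactly one overload survives substitution for each T, so each call resolves
without ambiguity; that is the property this patch's two-overload split
restores.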
---
 .../operators/math/selected_rows_functor.cc   | 23 +++++++++++++++----
 paddle/fluid/operators/mkldnn/axpy_handler.cc | 22 ------------------
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index 59502fa41cfa9..a72bdec05d77f 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -300,18 +300,33 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
 
+#ifdef PADDLE_WITH_MKLDNN
 template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value ||
+typename std::enable_if<std::is_same<T, float>::value ||
+                        std::is_same<T, platform::bfloat16>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
+  onednn_handler_axpy(data_len, T(1.f), in, out);
+}
+
+template <typename T>
+typename std::enable_if<std::is_same<T, double>::value ||
                         std::is_same<T, platform::complex<float>>::value ||
                         std::is_same<T, platform::complex<double>>::value>::type
 elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
                    const T* in, T* out) {
-#ifdef PADDLE_WITH_MKLDNN
-  onednn_handler_axpy(data_len, T(1.f), in, out);
+  blas->AXPY(data_len, T(1.f), in, out);
+}
 #else
+template <typename T>
+typename std::enable_if<std::is_floating_point<T>::value ||
+                        std::is_same<T, platform::complex<float>>::value ||
+                        std::is_same<T, platform::complex<double>>::value>::type
+elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+                   const T* in, T* out) {
   blas->AXPY(data_len, T(1.f), in, out);
-#endif
 }
+#endif
 
 template <typename T>
 typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(

diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc
index a504f7f2c366a..76101f19ab618 100644
--- a/paddle/fluid/operators/mkldnn/axpy_handler.cc
+++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc
@@ -143,28 +143,6 @@ void onednn_handler_axpy(int n, T alpha, const T *x, T *y) {
   astream.wait();
 }
 
-template <>
-void onednn_handler_axpy<double>(int, double, const double *, double *) {
-  PADDLE_THROW(plat::errors::Unavailable(
-      "Currently oneDNN library doesn't support double data type."));
-}
-
-template <>
-void onednn_handler_axpy<plat::complex<float>>(int, plat::complex<float>,
-                                               const plat::complex<float> *,
-                                               plat::complex<float> *) {
-  PADDLE_THROW(plat::errors::Unavailable(
-      "Currently oneDNN library doesn't support complex data type."));
-}
-
-template <>
-void onednn_handler_axpy<plat::complex<double>>(int, plat::complex<double>,
-                                                const plat::complex<double> *,
-                                                plat::complex<double> *) {
-  PADDLE_THROW(plat::errors::Unavailable(
-      "Currently oneDNN library doesn't support complex data type."));
-}
-
 template void onednn_handler_axpy<float>(int, float, const float *, float *);
 template void onednn_handler_axpy<plat::bfloat16>(int, plat::bfloat16,
                                                   const plat::bfloat16 *,
                                                   plat::bfloat16 *);

From 2dc3135d22d0a6b5c4e8d0d8a68eb1f20c528aba Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Fri, 11 Jun 2021 09:54:56 +0200
Subject: [PATCH 11/13] Add test case for SGD with big rows.
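
The new case matters because the per-row element count is what reaches the
n < 100 fallback check added in patch 02: with grad_row_numel = 120 each row
update actually exercises the oneDNN reorder path, while the earlier cases
(for example, 16 elements per row) only ever hit the naive loop. Roughly,
the branch being targeted (threshold as in axpy_handler.cc):

    // n == grad_row_numel for one sparse-row update
    if (n < 100) {
      naive_axpy(n, alpha, x, y);  // earlier test cases end here
      return;
    }
    // grad_row_numel = 120 continues into the oneDNN reorder code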
---
 python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index fa8ff4effcfd3..207bb087db9a9 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -158,6 +158,15 @@ def setup_params(self):
         self.grad_row_numel = 16
 
 
+@unittest.skipIf(not core.supports_bfloat16(),
+                 'place does not support BF16 evaluation')
+class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16):
+    def setup_params(self):
+        self.grad_height = 10
+        self.grad_rows = [0, 4, 7]
+        self.grad_row_numel = 120
+
+
 @unittest.skipIf(not core.supports_bfloat16(),
                  'place does not support BF16 evaluation')
 class TestSparseGradParamSGDOpBF16(TestSparseSGDOpBF16):

From f37be244421b10c9b7789548ad961dd7c3afd2be Mon Sep 17 00:00:00 2001
From: lidanqing-intel
Date: Fri, 18 Jun 2021 04:25:50 +0200
Subject: [PATCH 12/13] update

---
 paddle/fluid/operators/math/blas_impl.h                 | 2 +-
 python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py | 2 --
 python/paddle/fluid/tests/unittests/test_sum_op.py      | 2 --
 3 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index e2aebc89bec90..eab513e24bc80 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -40,7 +40,6 @@ static void axpy(int n, const T alpha, const T *x, const int incx, T *y,
     x = x + incx;
   }
 }
-
 }  // namespace detail
 
 template <typename T>
@@ -61,6 +60,7 @@ struct CBlas<platform::bfloat16> {
   static void AXPY(ARGS... args) {
     detail::axpy(args...);
   }
+
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     PADDLE_THROW(platform::errors::Unimplemented(

diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index 207bb087db9a9..bee45cce248f6 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -158,8 +158,6 @@ def setup_params(self):
         self.grad_row_numel = 16
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 'place does not support BF16 evaluation')
 class TestSparseGradSGDOpBF16Case3(TestSparseGradSGDOpBF16):
     def setup_params(self):
         self.grad_height = 10

diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py
index 6e23c26a3a796..f0fbd143c5a77 100644
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
@@ -211,8 +211,6 @@ def test_w_is_selected_rows(self):
             self.check_with_place(core.CPUPlace(), inplace)
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 'place does not support BF16 evaluation')
 class TestSelectedRowsSumBF16OpBigRow(TestSelectedRowsSumBF16Op):
     def init_kernel_type(self):
         self.row_numel = 102

From 435154de26f947e92468c7c7a73a4e9c9f1f5916 Mon Sep 17 00:00:00 2001
From: lidanqing-intel
Date: Mon, 21 Jun 2021 02:19:47 +0200
Subject: [PATCH 13/13] update

---
 python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
index bee45cce248f6..e60b04257dbbd 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op_bf16.py
@@ -201,8 +201,6 @@ def test_sparse_param_grad_sgd(self):
         self.check_output(output, reference, atol=5e-3, rtol=1e-1)
 
 
-@unittest.skipIf(not core.supports_bfloat16(),
-                 'place does not support BF16 evaluation')
 class TestSparseGradParamSGDOpBF16Case2(TestSparseGradParamSGDOpBF16):
     def setup_params(self):
         self.grad_height = 14