[Pten] blas and lapack migration (#39587)
* move blas-related files
* move lapack-related files
Feiyu Chan committed Feb 18, 2022
1 parent 1d6fd81 commit 8c7ee8c
Showing 134 changed files with 9,215 additions and 6,058 deletions.
39 changes: 39 additions & 0 deletions cmake/generic.cmake
@@ -1036,3 +1036,42 @@ function(generate_dummy_static_lib)
add_library(${dummy_LIB_NAME} STATIC ${dummy_FILE_PATH})
endfunction()

function(math_library TARGET)
  # math_library is a function to create a math library.
  # The interface is the same as cc_library's, but it handles
  # splitting GPU/CPU code and links some common libraries.
  set(cc_srcs)
  set(cu_srcs)
  set(hip_srcs)
  set(math_common_deps device_context framework_proto enforce)
  if (WITH_GPU)
    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
      list(APPEND math_common_deps cub)
    else()
      list(APPEND math_common_deps)
    endif()
  endif()
  set(multiValueArgs DEPS)
  cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
                        "${multiValueArgs}" ${ARGN})

  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
    list(APPEND cc_srcs ${TARGET}.cc)
  endif()
  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
    list(APPEND cu_srcs ${TARGET}.cu)
  endif()
  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
    list(APPEND cu_srcs ${TARGET}.cu.cc)
  endif()

  list(LENGTH cc_srcs cc_srcs_len)
  if (WITH_GPU)
    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
  elseif (WITH_ROCM)
    hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
  elseif(${cc_srcs_len} GREATER 0)
    cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
  endif()
endfunction()
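
For illustration, a hypothetical call site (the target and dependency names here are invented, not part of this commit), written as CMake to match the surrounding file:

# Hypothetical usage sketch: picks up my_math.cc, plus my_math.cu when
# building with CUDA or ROCm support, and links math_common_deps plus the
# extra dependency listed after DEPS.
math_library(my_math DEPS some_extra_dep)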

8 changes: 3 additions & 5 deletions paddle/fluid/distributed/common/utils.h
@@ -24,18 +24,16 @@
#include <utility>
#include <vector>

#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

namespace paddle {
namespace distributed {

template <typename T>
-inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
-GetBlas() {
+inline pten::funcs::BlasT<paddle::platform::CPUDeviceContext, T> GetBlas() {
paddle::platform::CPUDeviceContext cpu_ctx;
-  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
-                                          T>(cpu_ctx);
+  return pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(cpu_ctx);
}

template <typename T>
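A minimal sketch of how this helper is exercised (buffers and sizes invented for illustration; VADD follows the (n, x, y, out) form used by GeoCommunicator below):

  std::vector<float> x(4, 1.0f), y(4, 2.0f), out(4);
  // GetBlas<float>() wraps a CPU device context, as defined above.
  auto blas = paddle::distributed::GetBlas<float>();
  blas.VADD(4, x.data(), y.data(), out.data());  // out[i] = x[i] + y[i]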
14 changes: 4 additions & 10 deletions paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -1161,8 +1161,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());

auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
blas.VSUB(t_latest.numel(), t_latest.data<float>(),
t_timestamp->data<float>(), t_delta->data<float>());

@@ -1201,8 +1200,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
t_delta->mutable_data<float>(t_latest->dims(), cpu_ctx.GetPlace());

auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
blas.VSUB(t_latest->numel(), t_pserver.data<float>(), t_old->data<float>(),
t_delta->data<float>());
blas.VADD(t_latest->numel(), t_latest->data<float>(),
@@ -1303,9 +1301,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
t_delta->set_rows(sparse_ids);
t_delta->set_height(t_latest.dims()[0]);

-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
float coefficient = 1.0 / static_cast<float>(trainers_);

std::vector<float *> push_g_vec;
@@ -1371,9 +1367,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
v_delta.resize(numel);

paddle::platform::CPUDeviceContext cpu_ctx;
-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);

for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j]
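In scalar terms, the dense paths above reduce to element-wise vector primitives (a sketch using the tensor names from the surrounding code):

  // SendDense: delta = latest - timestamp                  (blas.VSUB)
  // RecvDense: delta = pserver - old, then latest += delta (blas.VSUB, blas.VADD)
  for (int64_t i = 0; i < n; ++i) delta[i] = latest[i] - timestamp[i];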
@@ -34,12 +34,12 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"

#include "paddle/fluid/distributed/ps/service/ps_client.h"
10 changes: 5 additions & 5 deletions paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -17,7 +17,7 @@
#include <string>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

namespace paddle {
namespace framework {
@@ -121,14 +121,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,

// broadcast biases
std::vector<float> ones(m, 1.0f);
-    paddle::operators::math::CBlas<float>::GEMM(
+    pten::funcs::CBlas<float>::GEMM(
CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
&combined_biases[0], n, 0.0f, embeddings_data, n);

// Wx*embeddings + biases
-    paddle::operators::math::CBlas<float>::GEMM(
-        CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
-        embedding_data, k, weightx_data, n, beta, embeddings_data, n);
+    pten::funcs::CBlas<float>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                    m, n, k, alpha, embedding_data, k,
+                                    weightx_data, n, beta, embeddings_data, n);
op_desc.SetInput("Embeddings", {embeddings});

op_desc.SetInput("H0", {});
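The first GEMM above is a bias-broadcast trick: an m x 1 column of ones times the 1 x n bias row copies the biases into every row of embeddings_data. A plain-loop sketch of the same effect (assuming alpha is 1.0f in this pass):

  // Equivalent of GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
  //                    m, n, /*k=*/1, 1.0f, &ones[0], 1, &combined_biases[0], n,
  //                    0.0f, embeddings_data, n):
  for (int i = 0; i < m; ++i)
    for (int j = 0; j < n; ++j)
      embeddings_data[i * n + j] = combined_biases[j];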
6 changes: 3 additions & 3 deletions paddle/fluid/imperative/gradient_accumulator.cc
@@ -22,13 +22,13 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_XPU
#include "xpu/refactor/math.h"
@@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
void operator()(const platform::CPUPlace& place) const {
platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}

@@ -118,7 +118,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
platform::CUDADeviceContext* ctx =
dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(*ctx);
blas.AXPY(numel_, 1., x_, y_);
}
#else
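For reference, AXPY(n, alpha, x, y) computes y <- alpha * x + y, so with alpha == 1 the calls above are plain element-wise gradient accumulation. A scalar sketch of what blas.AXPY(numel_, 1., x_, y_) does:

  for (int64_t i = 0; i < numel_; ++i) {
    y_[i] += x_[i];  // alpha == 1, so no scaling
  }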
@@ -22,8 +22,8 @@
#include "paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

namespace paddle {
namespace inference {
2 changes: 1 addition & 1 deletion paddle/fluid/operators/activation_op.h
@@ -28,9 +28,9 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
6 changes: 3 additions & 3 deletions paddle/fluid/operators/addmm_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"

namespace ops = paddle::operators;
@@ -94,7 +94,7 @@ class AddMMKernel : public framework::OpKernel<T> {
float alpha = context.template Attr<float>("Alpha");
float beta = context.template Attr<float>("Beta");

-    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);

// calc broadcast dim
Array2 bcast_dims;
@@ -146,7 +146,7 @@ class AddMMGradKernel : public framework::OpKernel<T> {
}

auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (dinput) {
dinput->mutable_data<T>(ctx.GetPlace());
total_elems = in_dims[0] * in_dims[1];
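AddMM combines a scaled matrix product with a scaled addend: out = Beta * input + Alpha * (x @ y), with input broadcast to the product's shape. In index form (a sketch, not the kernel's actual Eigen code):

  // out[i][j] = beta * input[i][j] + alpha * sum_k x[i][k] * y[k][j]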
6 changes: 3 additions & 3 deletions paddle/fluid/operators/affine_grid_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"

namespace paddle {
@@ -122,7 +122,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
@@ -165,7 +165,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
// output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
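Shape-wise, each loop iteration multiplies one batch's index grid, flattened to (h*w) x 3 homogeneous coordinates, by that batch's 2 x 3 affine matrix transposed; the TODO notes this per-slice loop could become a single batched GEMM. As a comment sketch:

  // Per batch i: out_i ((h*w) x 2) = grid_i ((h*w) x 3) * theta_i^T (3 x 2)
  // i.e. [x', y'] = [x, y, 1] * theta_i^T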
2 changes: 1 addition & 1 deletion paddle/fluid/operators/atan2_op.h
@@ -17,10 +17,10 @@
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

namespace paddle {
namespace operators {
4 changes: 2 additions & 2 deletions paddle/fluid/operators/attention_lstm_op.cc
@@ -14,10 +14,10 @@ limitations under the License. */

#include "paddle/fluid/operators/attention_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

namespace paddle {
namespace operators {
@@ -373,7 +373,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());

-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);

// x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
6 changes: 3 additions & 3 deletions paddle/fluid/operators/batch_fc_op.cu
@@ -15,9 +15,9 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/batch_fc_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

namespace paddle {
namespace operators {
@@ -112,7 +112,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
int64_t strideA = ins_num * in_dim;
int64_t strideB = in_dim * out_dim;

-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
blas.BatchedGEMM(transA, transB, ins_num, out_dim, in_dim, alpha, in_data,
w_data, beta, out_data, slot_pairs_num, strideA, strideB);
add_bias<T>(ctx.cuda_device_context().stream(), out_data, slot_pairs_num,
@@ -165,7 +165,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
add_bias_grad<T>(ctx.cuda_device_context().stream(), dout_data,
slot_pairs_num, ins_num, out_dim, db_data);

-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
T alpha = 1;
T beta = 0;

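BatchedGEMM here runs slot_pairs_num independent GEMMs over back-to-back matrices; strideA and strideB are the element offsets between consecutive batch members of the input and the weights. A loop-form sketch (MatMul is a hypothetical stand-in for one plain GEMM):

  // Each batch b multiplies an ins_num x in_dim input slice by an
  // in_dim x out_dim weight slice into an ins_num x out_dim output slice.
  for (int64_t b = 0; b < slot_pairs_num; ++b) {
    MatMul(in_data + b * strideA,              // strideA = ins_num * in_dim
           w_data + b * strideB,               // strideB = in_dim * out_dim
           out_data + b * ins_num * out_dim);
  }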
6 changes: 3 additions & 3 deletions paddle/fluid/operators/bilinear_tensor_product_op.h
@@ -16,7 +16,7 @@ limitations under the License. */

#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

namespace paddle {
namespace operators {
@@ -61,7 +61,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
auto output_col_vec = output_mat.chip(i, 1);
Tensor weight_mat =
weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
+      pten::funcs::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
weight_mat.data<T>(), 0, left_mul.data<T>());
output_col_vec.device(place) =
@@ -127,7 +127,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
d_weight->mutable_data<T>(ctx.GetPlace());
}

-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);

    // Calculate the Output(X@Grad) and Output(Y@Grad).
if (d_x || d_y || d_weight) {
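For orientation, the bilinear form this kernel evaluates is out[b][i] = x_b^T * W_i * y_b, one weight slice W_i per output channel; the GEMM above batches the left multiplication left_mul = x * W_i across all rows of x. A naive scalar sketch (dense indexing and the out_dim name invented for illustration):

  // out[b][i] = sum over p, q of x[b][p] * W[i][p][q] * y[b][q]
  for (int b = 0; b < batch_size; ++b)
    for (int i = 0; i < out_dim; ++i) {
      T acc = 0;
      for (int p = 0; p < x_dim; ++p)
        for (int q = 0; q < y_dim; ++q)
          acc += x[b * x_dim + p] * W[(i * x_dim + p) * y_dim + q] * y[b * y_dim + q];
      out[b * out_dim + i] = acc;
    }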