Optimize memcpy operation in Eigh (#42853)
* 1st commit

* fix useless change in the transpose_kernel_h header file

* add sync
JamesLim-sy committed May 30, 2022
1 parent 3591a25 commit 806073d
Showing 1 changed file with 50 additions and 36 deletions.
paddle/phi/kernels/funcs/values_vectors_functor.h: 50 additions & 36 deletions (86 changes)
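
Net effect of the diff below: the GPU Eigh path previously copied each matrix's cuSOLVER status flag from device to host inside the batch loop, i.e. one small synchronizing memcpy per matrix. After this change every solver call writes its status into its own slot (&info_ptr[i]), and the new CheckEighResult(dev_ctx, batch_size, info_ptr) overload copies all flags back in a single memcpy followed by one dev_ctx.Wait(). The stand-alone CUDA sketch below illustrates the same batched status-check pattern; it is not Paddle code, and the FakeSolveOneMatrix kernel is a hypothetical stand-in for the per-matrix cusolverDnSsyevj / Evd calls.

// Minimal CUDA sketch of the batched "info" check introduced by this commit.
// Each per-matrix "solve" writes its status into its own slot; all slots are
// copied back to the host once, after the loop, and validated there.
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>

// Hypothetical stand-in for the cuSOLVER call; a real implementation would
// run cusolverDnSsyevj / Evd and let cuSOLVER write the status for batch i.
__global__ void FakeSolveOneMatrix(int batch_idx, int *info) {
  info[batch_idx] = 0;  // 0 means the factorization converged
}

int main() {
  const int batch_size = 8;
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  int *d_info = nullptr;
  cudaMalloc(&d_info, sizeof(int) * batch_size);

  // One "solve" per matrix, each writing its own status slot, mirroring the
  // &info_ptr[i] argument used in the updated kernel.
  for (int i = 0; i < batch_size; ++i) {
    FakeSolveOneMatrix<<<1, 1, 0, stream>>>(i, d_info);
  }

  // Single batched copy plus one synchronization, replacing batch_size tiny
  // per-iteration copies; mirrors paddle::memory::Copy(...) + dev_ctx.Wait().
  std::vector<int> h_info(batch_size);
  cudaMemcpyAsync(h_info.data(), d_info, sizeof(int) * batch_size,
                  cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);

  for (int i = 0; i < batch_size; ++i) {
    if (h_info[i] != 0) {
      std::fprintf(stderr, "batch %d failed with info = %d\n", i, h_info[i]);
      return EXIT_FAILURE;
    }
  }
  std::puts("all batches converged");

  cudaFree(d_info);
  cudaStreamDestroy(stream);
  return 0;
}

The trade-off is batch_size small blocking transfers versus one transfer and one explicit wait, which is the memcpy optimization named in the commit title.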
@@ -27,10 +27,10 @@
 namespace phi {
 namespace funcs {
 
-inline int64_t GetBatchSize(phi::DDim dims) {
+inline int64_t GetBatchSize(const phi::DDim &dims) {
   int64_t batch_size = 1;
   auto dim_size = dims.size();
-  for (int i = 0; i < dim_size - 2; i++) {
+  for (int i = 0; i < dim_size - 2; ++i) {
     batch_size *= dims[i];
   }
   return batch_size;
@@ -54,6 +54,24 @@ static void CheckEighResult(const int batch, const int info) {
          info));
 }
 
+#ifdef PADDLE_WITH_CUDA
+static void CheckEighResult(const GPUContext &dev_ctx,
+                            const int64_t batch_size,
+                            int *info) {
+  std::vector<int> error_info(batch_size);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       error_info.data(),
+                       dev_ctx.GetPlace(),
+                       info,
+                       sizeof(int) * batch_size,
+                       dev_ctx.stream());
+  dev_ctx.Wait();
+  for (auto i = 0; i < batch_size; ++i) {
+    CheckEighResult(i, error_info[i]);
+  }
+}
+#endif
+
 template <typename DeviceContext, typename T>
 struct MatrixEighFunctor {
   void operator()(const DeviceContext &dev_ctx,
@@ -95,7 +113,8 @@ struct MatrixEighFunctor<CPUContext, T> {
     char jobz = has_vectors ? 'V' : 'N';
     int n = dims[dim_size - 1];
     int64_t lda = std::max<int64_t>(1, n);
-    // if work = -1, it means that you need to use the lapack function to query
+    // if work = -1, it means that you need to use the lapack function to
+    // query
     // the optimal value
     int lwork = -1;   // The length of the array work
     int lrwork = -1;  // The dimension of the array rwork,rwork is REAL array
@@ -188,97 +207,92 @@ struct MatrixEighFunctor<GPUContext, T> {
                   bool is_lower,
                   bool has_vectors) {
     using ValueType = phi::dtype::Real<T>;
-    ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
-
-    DenseTensor input_trans;
-    input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
-    T *input_vector = input_trans.data<T>();
+    int workspace_size = 0;
     auto &dims = input.dims();
     int dim_size = dims.size();
     int64_t batch_size = GetBatchSize(dims);
+    int last_dim = dims[dim_size - 1];
+    int lda = std::max<int>(1, last_dim);
+    auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
+    auto values_stride = dims[dim_size - 1];
 
     cublasFillMode_t uplo =
         is_lower ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
     cusolverEigMode_t jobz =
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
 
-    int n = dims[dim_size - 1];
-    int lda = std::max<int>(1, n);
-    auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
-    auto values_stride = dims[dim_size - 1];
-    int lwork = 0;
+    ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
     auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size);
     auto *info_ptr = reinterpret_cast<int *>(info->ptr());
 
-    // When the input type is float32, and the feature value input dimension
-    // is greater than or equal to [*,32,32] and less than or equal to
-    // [*,512,512], Syevj has better performance.
+    DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
+    T *input_vector = input_trans.data<T>();
+
+    // Once input data type is float32, and the last dimension of
+    // input is located in range [32, 512], Syevj works better.
     bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 &&
                       values_stride >= 32 && values_stride <= 512);
+    auto handle = dev_ctx.cusolver_dn_handle();
+
     syevjInfo_t syevj_params;
     if (use_syevj) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           dynload::cusolverDnCreateSyevjInfo(&syevj_params));
 
       PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
           dev_ctx.cusolver_dn_handle(),
           jobz,
           uplo,
-          n,
+          last_dim,
           reinterpret_cast<const float *>(input_vector),
           lda,
           reinterpret_cast<const float *>(out_value),
-          &lwork,
+          &workspace_size,
           syevj_params));
     } else {
       EvdBuffer(dev_ctx.cusolver_dn_handle(),
                 jobz,
                 uplo,
-                n,
+                last_dim,
                 input_vector,
                 lda,
                 out_value,
-                &lwork);
+                &workspace_size);
     }
-    auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * lwork);
+    auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size);
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
-    for (auto i = 0; i < batch_size; i++) {
+
+    for (auto i = 0; i < batch_size; ++i) {
       auto *input_data = input_vector + i * vector_stride;
       auto *value_data = out_value + i * values_stride;
-      auto handle = dev_ctx.cusolver_dn_handle();
       if (use_syevj) {
         PADDLE_ENFORCE_GPU_SUCCESS(
             dynload::cusolverDnSsyevj(handle,
                                       jobz,
                                       uplo,
-                                      n,
+                                      last_dim,
                                       reinterpret_cast<float *>(input_data),
                                       lda,
                                       reinterpret_cast<float *>(value_data),
                                       reinterpret_cast<float *>(work_ptr),
-                                      lwork,
-                                      info_ptr,
+                                      workspace_size,
+                                      &info_ptr[i],
                                       syevj_params));
       } else {
         Evd(handle,
             jobz,
             uplo,
-            n,
+            last_dim,
             input_data,
             lda,
             value_data,
             work_ptr,
-            lwork,
-            info_ptr);
+            workspace_size,
+            &info_ptr[i]);
       }
-      int error_info = 0;
-      paddle::memory::Copy(phi::CPUPlace(),
-                           &error_info,
-                           dev_ctx.GetPlace(),
-                           info_ptr,
-                           sizeof(int),
-                           dev_ctx.stream());
-      CheckEighResult(i, error_info);
     }
+    CheckEighResult(dev_ctx, batch_size, info_ptr);
 
     if (use_syevj) {
       PADDLE_ENFORCE_GPU_SUCCESS(
