CPU forward calculation replaces Eigen with Lapack
Zjq9409 committed Sep 22, 2021
1 parent 1238115 commit 9e4cf0d
Showing 4 changed files with 135 additions and 107 deletions.
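This commit ports the CPU eigendecomposition path from Eigen's SelfAdjointEigenSolver to LAPACK's divide-and-conquer routines (ssyevd_/dsyevd_ for real symmetric input, cheevd_/zheevd_ for complex Hermitian input). All of these follow the standard two-phase LAPACK protocol the new functor relies on: a first call with lwork = -1 only queries the optimal workspace sizes, then a second call with properly sized buffers performs the decomposition. The following minimal, self-contained sketch (an illustration against a system LAPACK, not Paddle code) shows that protocol:

// Two-phase ?syevd protocol: query workspace sizes, then compute.
// Assumes linking against a LAPACK that exports the Fortran symbol ssyevd_.
#include <algorithm>
#include <vector>

extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
                        float *w, float *work, int *lwork, int *iwork,
                        int *liwork, int *info);

int main() {
  int n = 3, lda = 3, info = 0;
  // Column-major 3x3 symmetric matrix; uplo = 'L' reads the lower triangle.
  std::vector<float> a = {4, 1, 2, 1, 5, 3, 2, 3, 6};
  std::vector<float> w(n);  // eigenvalues, returned in ascending order
  char jobz = 'V', uplo = 'L';

  // Phase 1: workspace query. lwork = liwork = -1 asks LAPACK to write the
  // optimal sizes into work[0] / iwork[0] without computing anything.
  float work_query;
  int iwork_query, lwork = -1, liwork = -1;
  ssyevd_(&jobz, &uplo, &n, a.data(), &lda, w.data(), &work_query, &lwork,
          &iwork_query, &liwork, &info);

  // Phase 2: allocate the reported sizes and do the real decomposition.
  lwork = std::max(1, static_cast<int>(work_query));
  liwork = std::max(1, iwork_query);
  std::vector<float> work(lwork);
  std::vector<int> iwork(liwork);
  ssyevd_(&jobz, &uplo, &n, a.data(), &lda, w.data(), work.data(), &lwork,
          iwork.data(), &liwork, &info);
  return info;  // 0 on success; a is overwritten with the eigenvectors
}

LAPACK expects column-major storage, which presumably is why the functor below transposes the input into the output buffer before the calls and transposes back at the end when eigenvectors are requested.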
167 changes: 64 additions & 103 deletions paddle/fluid/operators/math/eigen_values_vectors.h
@@ -16,6 +16,7 @@
 
 #include "Eigen/Core"
 #include "paddle/fluid/memory/memory.h"
+#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cusolver.h"
@@ -25,84 +26,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using InputMatrixMap = Eigen::Map<
-    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using OutputMatrixMap = Eigen::Map<
-    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
-
-template <typename ValueType>
-inline void ComputeFloatEigenvaluesAndVectors(ValueType *x_data,
-                                              ValueType *eigenvalues_data,
-                                              ValueType *eigenvectors_data,
-                                              int batches, int rows, int cols,
-                                              bool has_vectors) {
-  int stride = rows * cols;
-  for (int i = 0; i < batches; i++) {
-    auto m = InputMatrixMap<ValueType>(x_data + i * stride, rows, cols);
-    auto eigenvalues =
-        OutputMatrixMap<ValueType>(eigenvalues_data + i * rows, 1, rows);
-    auto eigenvectors =
-        OutputMatrixMap<ValueType>(eigenvectors_data + i * stride, rows, cols);
-
-    Eigen::SelfAdjointEigenSolver<Eigen::Matrix<
-        ValueType, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-        eigen_solver(m, has_vectors ? Eigen::ComputeEigenvectors
-                                    : Eigen::EigenvaluesOnly);
-    PADDLE_ENFORCE_EQ(
-        eigen_solver.info(), Eigen::Success,
-        platform::errors::InvalidArgument(
-            "Self Adjoint Eigen decomposition is not successful. "
-            "The %d-th input matrix might not be positive definite.",
-            i));
-
-    eigenvalues = eigen_solver.eigenvalues().transpose();
-    if (has_vectors) {
-      eigenvectors = eigen_solver.eigenvectors();
-    }
-  }
-}
-
-template <typename T, typename ValueType>
-inline void ComputeComplexEigenvaluesAndVectors(T *x_data,
-                                                ValueType *eigenvalues_data,
-                                                T *eigenvectors_data,
-                                                int batches, int rows, int cols,
-                                                bool has_vectors) {
-  using Complex = std::complex<ValueType>;
-  Complex *input = reinterpret_cast<Complex *>(x_data);
-  Complex *eigenvectors_data_ = reinterpret_cast<Complex *>(eigenvectors_data);
-
-  int stride = rows * cols;
-  for (int i = 0; i < batches; i++) {
-    auto m = InputMatrixMap<Complex>(input + i * stride, rows, cols);
-    auto eigenvalues =
-        OutputMatrixMap<ValueType>(eigenvalues_data + i * rows, 1, rows);
-    auto eigenvectors =
-        OutputMatrixMap<Complex>(eigenvectors_data_ + i * stride, rows, cols);
-
-    Eigen::SelfAdjointEigenSolver<
-        Eigen::Matrix<Complex, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
-        eigen_solver(m, has_vectors ? Eigen::ComputeEigenvectors
-                                    : Eigen::EigenvaluesOnly);
-    PADDLE_ENFORCE_EQ(
-        eigen_solver.info(), Eigen::Success,
-        platform::errors::InvalidArgument(
-            "Self Adjoint Eigen decomposition is not successful. "
-            "The %d-th input matrix might not be positive definite.",
-            i));
-
-    eigenvalues = eigen_solver.eigenvalues().transpose();
-    if (has_vectors) {
-      eigenvectors = eigen_solver.eigenvectors();
-    }
-  }
-}
-
 inline int64_t GetBatchSize(framework::DDim dims) {
   int64_t batch_size = 1;
   auto dim_size = dims.size();
@@ -128,37 +51,75 @@ struct MatrixEighFunctor<platform::CPUDeviceContext, ValueType, T> {
   void operator()(const framework::ExecutionContext &ctx, const Tensor &input,
                   Tensor *eigen_values, Tensor *eigen_vectors, bool is_lower,
                   bool has_vectors) {
-    auto dims = input.dims();
-    auto output_value_dim = eigen_values->dims();
+    auto *out_value = eigen_values->mutable_data<ValueType>(ctx.GetPlace());
+    auto *out_vector = eigen_vectors->mutable_data<T>(ctx.GetPlace());
 
-    int64_t batch_size = 1;
+    auto dims = input.dims();
     int dim_size = dims.size();
-    for (int64_t i = 0; i < dim_size - 2; i++) {
-      batch_size *= dims[i];
-    }
+    int64_t batch_size = GetBatchSize(dims);
 
     auto dito =
-        DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(ctx);
-    Tensor input_tensor;
-    TensorCopy(input, ctx.GetPlace(), &input_tensor);
-    if (!is_lower) {
-      input_tensor = dito.Transpose(input);
-    }
+        math::DeviceIndependenceTensorOperations<platform::CPUDeviceContext, T>(
+            ctx);
+    Tensor output_v_var_trans = dito.Transpose(input);
+    TensorCopy(output_v_var_trans, ctx.GetPlace(), eigen_vectors);
+
+    int vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
+    int values_stride = dims[dim_size - 1];
+    char uplo = is_lower ? 'L' : 'U';
+    char jobz = has_vectors ? 'V' : 'N';
+    auto n = dims[dim_size - 1];
+    auto lda = std::max<int64_t>(1, n);
+
+    int lwork = -1;
+    int lrwork = -1;
+    int liwork = -1;
+    int iwork_buffer = -1;
+    T lwork_buffer = static_cast<T>(-1);
+    ValueType rwork_buffer = static_cast<ValueType>(-1);
+
+    Tensor info_tensor;
+    auto *infos_data = info_tensor.mutable_data<int>(
+        framework::make_ddim({batch_size}), ctx.GetPlace());
+
+    math::lapackEvd<T, ValueType>(jobz, uplo, n, out_vector, lda, out_value,
+                                  &lwork_buffer, lwork, &rwork_buffer, lrwork,
+                                  &iwork_buffer, liwork, infos_data);
+
+    lwork = std::max<int>(1, static_cast<int>(lwork_buffer));
+    liwork = std::max<int>(1, iwork_buffer);
+
+    Tensor rwork_tensor;
+    ValueType *rwork_data = nullptr;
+
+    // complex type
+    if (framework::IsComplexType(eigen_vectors->type())) {
+      lrwork = std::max<int>(1, static_cast<int>(rwork_buffer));
+      rwork_data = rwork_tensor.mutable_data<ValueType>(
+          framework::make_ddim({lrwork}), ctx.GetPlace());
+    }
-    int rows = dims[dims.size() - 2];
 
-    auto *value_data =
-        eigen_values->mutable_data<ValueType>(output_value_dim, ctx.GetPlace());
+    Tensor iwork_tensor, work_tensor;
+    auto *iwork_data = iwork_tensor.mutable_data<int>(
+        framework::make_ddim({liwork}), ctx.GetPlace());
+    auto *work_data = work_tensor.mutable_data<T>(framework::make_ddim({lwork}),
+                                                  ctx.GetPlace());
 
-    if (framework::IsComplexType(input_tensor.type())) {
-      auto *x_data = input_tensor.data<T>();
-      auto *vector_data = eigen_vectors->mutable_data<T>(dims, ctx.GetPlace());
-      ComputeComplexEigenvaluesAndVectors<T, ValueType>(
-          x_data, value_data, vector_data, batch_size, rows, rows, has_vectors);
-    } else {
-      auto *x_data = input_tensor.data<ValueType>();
-      auto *vector_data =
-          eigen_vectors->mutable_data<ValueType>(dims, ctx.GetPlace());
-      ComputeFloatEigenvaluesAndVectors<ValueType>(
-          x_data, value_data, vector_data, batch_size, rows, rows, has_vectors);
-    }
+    for (auto i = 0; i < batch_size; i++) {
+      auto *value_data = out_value + i * values_stride;
+      auto *vector_data = out_vector + i * vector_stride;
+      int *info_ptr = &infos_data[i];
+      math::lapackEvd<T, ValueType>(jobz, uplo, n, vector_data, lda, value_data,
+                                    work_data, lwork, rwork_data, lrwork,
+                                    iwork_data, liwork, info_ptr);
+      PADDLE_ENFORCE_EQ(
+          *info_ptr, 0,
+          platform::errors::PreconditionNotMet(
+              "For batch [%d]: the [%d] argument had an illegal value", i,
+              *info_ptr));
+    }
+    if (has_vectors) {
+      *eigen_vectors = dito.Transpose(*eigen_vectors);
+    }
   }
 };
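The loop at the end of the functor amortizes a single workspace query across the whole batch (every matrix shares the same n) and then steps through the packed buffers with vector_stride = n*n and values_stride = n, checking info after each call. A hypothetical stand-alone rendering of that pattern, using the raw dsyevd_ symbol instead of Paddle's tensors and wrappers:

// Batched symmetric eigendecomposition over contiguously packed matrices.
// Illustrative sketch only; assumes a system LAPACK exporting dsyevd_.
#include <vector>

extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
                        double *w, double *work, int *lwork, int *iwork,
                        int *liwork, int *info);

// Returns the index of the first failing matrix, or -1 on success.
// On success each n x n block of `a` holds eigenvectors and each
// length-n block of `w` holds ascending eigenvalues.
int BatchedEigh(double *a, double *w, int batch, int n) {
  char jobz = 'V', uplo = 'L';
  int lda = n, info = 0, lwork = -1, liwork = -1, iwork_query = -1;
  double work_query = -1;
  // One workspace query serves the whole batch, since every matrix is n x n.
  dsyevd_(&jobz, &uplo, &n, a, &lda, w, &work_query, &lwork, &iwork_query,
          &liwork, &info);
  lwork = static_cast<int>(work_query);
  liwork = iwork_query;
  std::vector<double> work(lwork);
  std::vector<int> iwork(liwork);

  const int vector_stride = n * n;  // same role as vector_stride in the diff
  const int values_stride = n;      // same role as values_stride in the diff
  for (int i = 0; i < batch; ++i) {
    dsyevd_(&jobz, &uplo, &n, a + i * vector_stride, &lda,
            w + i * values_stride, work.data(), &lwork, iwork.data(), &liwork,
            &info);
    if (info != 0) return i;  // info < 0: bad argument; info > 0: not converged
  }
  return -1;
}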
47 changes: 45 additions & 2 deletions paddle/fluid/operators/math/lapack_function.cc
@@ -21,15 +21,58 @@ namespace math {
 
 // LU (for example)
 template <>
-void lapackLu<double>(int m, int n, double *a, int lda, int *ipiv, int *info) {
+void lapackLu<double>(int m, int n, double* a, int lda, int* ipiv, int* info) {
   platform::dynload::dgetrf_(&m, &n, a, &lda, ipiv, info);
 }
 
 template <>
-void lapackLu<float>(int m, int n, float *a, int lda, int *ipiv, int *info) {
+void lapackLu<float>(int m, int n, float* a, int lda, int* ipiv, int* info) {
   platform::dynload::sgetrf_(&m, &n, a, &lda, ipiv, info);
 }
 
+template <>
+void lapackEvd<float, float>(char jobz, char uplo, int n, float* a, int lda,
+                             float* w, float* work, int lwork, float* rwork,
+                             int lrwork, int* iwork, int liwork, int* info) {
+  (void)rwork;   // unused
+  (void)lrwork;  // unused
+  platform::dynload::ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
+                             &liwork, info);
+}
+
+template <>
+void lapackEvd<double, double>(char jobz, char uplo, int n, double* a, int lda,
+                               double* w, double* work, int lwork,
+                               double* rwork, int lrwork, int* iwork,
+                               int liwork, int* info) {
+  (void)rwork;   // unused
+  (void)lrwork;  // unused
+  platform::dynload::dsyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
+                             &liwork, info);
+}
+
+template <>
+void lapackEvd<paddle::platform::complex<float>, float>(
+    char jobz, char uplo, int n, paddle::platform::complex<float>* a, int lda,
+    float* w, paddle::platform::complex<float>* work, int lwork, float* rwork,
+    int lrwork, int* iwork, int liwork, int* info) {
+  platform::dynload::cheevd_(&jobz, &uplo, &n,
+                             reinterpret_cast<std::complex<float>*>(a), &lda, w,
+                             reinterpret_cast<std::complex<float>*>(work),
+                             &lwork, rwork, &lrwork, iwork, &liwork, info);
+}
+
+template <>
+void lapackEvd<paddle::platform::complex<double>, double>(
+    char jobz, char uplo, int n, paddle::platform::complex<double>* a, int lda,
+    double* w, paddle::platform::complex<double>* work, int lwork,
+    double* rwork, int lrwork, int* iwork, int liwork, int* info) {
+  platform::dynload::zheevd_(&jobz, &uplo, &n,
+                             reinterpret_cast<std::complex<double>*>(a), &lda,
+                             w, reinterpret_cast<std::complex<double>*>(work),
+                             &lwork, rwork, &lrwork, iwork, &liwork, info);
+}
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
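Unlike ssyevd_/dsyevd_, the complex routines cheevd_/zheevd_ take an extra real-valued workspace (rwork/lrwork), which is why the lapackEvd template carries those parameters and the real specializations above simply discard them. A minimal sketch of the complex query-then-compute flow (illustration only, assuming a system LAPACK that exports zheevd_):

// zheevd_ needs three workspaces: complex work, real rwork, integer iwork.
#include <algorithm>
#include <complex>
#include <vector>

extern "C" void zheevd_(char *jobz, char *uplo, int *n,
                        std::complex<double> *a, int *lda, double *w,
                        std::complex<double> *work, int *lwork, double *rwork,
                        int *lrwork, int *iwork, int *liwork, int *info);

int main() {
  int n = 2, lda = 2, info = 0;
  // Column-major 2x2 Hermitian matrix; only the lower triangle is read.
  std::vector<std::complex<double>> a = {
      {2.0, 0.0}, {1.0, -1.0},  // column 0
      {1.0, 1.0}, {3.0, 0.0}};  // column 1
  std::vector<double> w(n);
  char jobz = 'V', uplo = 'L';

  // Workspace query: all three optimal sizes come back in one call.
  std::complex<double> work_query;
  double rwork_query;
  int iwork_query, lwork = -1, lrwork = -1, liwork = -1;
  zheevd_(&jobz, &uplo, &n, a.data(), &lda, w.data(), &work_query, &lwork,
          &rwork_query, &lrwork, &iwork_query, &liwork, &info);

  std::vector<std::complex<double>> work(
      std::max(1, static_cast<int>(work_query.real())));
  std::vector<double> rwork(std::max(1, static_cast<int>(rwork_query)));
  std::vector<int> iwork(std::max(1, iwork_query));
  lwork = static_cast<int>(work.size());
  lrwork = static_cast<int>(rwork.size());
  liwork = static_cast<int>(iwork.size());
  zheevd_(&jobz, &uplo, &n, a.data(), &lda, w.data(), work.data(), &lwork,
          rwork.data(), &lrwork, iwork.data(), &liwork, &info);
  return info;  // 0 on success; a now holds the eigenvectors
}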
7 changes: 6 additions & 1 deletion paddle/fluid/operators/math/lapack_function.h
@@ -20,7 +20,12 @@ namespace math {
 
 // LU (for example)
 template <typename T>
-void lapackLu(int m, int n, T *a, int lda, int *ipiv, int *info);
+void lapackLu(int m, int n, T* a, int lda, int* ipiv, int* info);
+
+template <typename T, typename ValueType>
+void lapackEvd(char jobz, char uplo, int n, T* a, int lda, ValueType* w,
+               T* work, int lwork, ValueType* rwork, int lrwork, int* iwork,
+               int liwork, int* info);
 
 }  // namespace math
 }  // namespace operators
21 changes: 20 additions & 1 deletion paddle/fluid/platform/dynload/lapack.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <mutex>
+#include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/platform/port.h"

Expand All @@ -26,6 +27,20 @@ extern "C" void dgetrf_(int *m, int *n, double *a, int *lda, int *ipiv,
int *info);
extern "C" void sgetrf_(int *m, int *n, float *a, int *lda, int *ipiv,
int *info);
extern "C" void zheevd_(char *jobz, char *uplo, int *n, std::complex<double> *a,
int *lda, double *w, std::complex<double> *work,
int *lwork, double *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void cheevd_(char *jobz, char *uplo, int *n, std::complex<float> *a,
int *lda, float *w, std::complex<float> *work,
int *lwork, float *rwork, int *lrwork, int *iwork,
int *liwork, int *info);
extern "C" void dsyevd_(char *jobz, char *uplo, int *n, double *a, int *lda,
double *w, double *work, int *lwork, int *iwork,
int *liwork, int *info);
extern "C" void ssyevd_(char *jobz, char *uplo, int *n, float *a, int *lda,
float *w, float *work, int *lwork, int *iwork,
int *liwork, int *info);

namespace paddle {
namespace platform {
@@ -58,7 +73,11 @@ extern void *lapack_dso_handle;
 
 #define LAPACK_ROUTINE_EACH(__macro) \
   __macro(dgetrf_);                  \
-  __macro(sgetrf_);
+  __macro(sgetrf_);                  \
+  __macro(zheevd_);                  \
+  __macro(cheevd_);                  \
+  __macro(dsyevd_);                  \
+  __macro(ssyevd_);
 
 LAPACK_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_LAPACK_WRAP);
 
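DECLARE_DYNAMIC_LOAD_LAPACK_WRAP resolves each listed symbol from the LAPACK shared library at runtime rather than at link time. The sketch below is a deliberately simplified stand-in for how such a wrapper typically works (dlopen/dlsym with one-time initialization); Paddle's actual macro and its dynamic_loader machinery differ in detail, and error handling is omitted here for brevity.

// Simplified dynamic-load wrapper; not Paddle's real implementation.
#include <dlfcn.h>
#include <mutex>

void *lapack_dso_handle = nullptr;  // Paddle opens this via its loader

template <typename Fn>
Fn LoadLapackSymbol(const char *name) {
  static std::once_flag flag;
  std::call_once(flag, [] {
    if (lapack_dso_handle == nullptr) {
      // Library name is an assumption; real loaders probe several candidates.
      lapack_dso_handle = dlopen("liblapack.so.3", RTLD_LAZY);
    }
  });
  // No error handling: dlsym on a null handle would fail in real code.
  return reinterpret_cast<Fn>(dlsym(lapack_dso_handle, name));
}

// Usage: resolve ssyevd_ on first call instead of at link time.
using ssyevd_t = void (*)(char *, char *, int *, float *, int *, float *,
                          float *, int *, int *, int *, int *);
inline void call_ssyevd(char *jobz, char *uplo, int *n, float *a, int *lda,
                        float *w, float *work, int *lwork, int *iwork,
                        int *liwork, int *info) {
  static ssyevd_t fn = LoadLapackSymbol<ssyevd_t>("ssyevd_");
  fn(jobz, uplo, n, a, lda, w, work, lwork, iwork, liwork, info);
}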