From 7da90a1f41e58cf7ed72040b0e8e918e8afc5f70 Mon Sep 17 00:00:00 2001
From: JamesLim-sy <santamonic@sina.com>
Date: Wed, 18 May 2022 17:58:06 +0000
Subject: [PATCH 1/3] 1st commit

---
 .../kernels/funcs/values_vectors_functor.h    | 85 +++++++++++--------
 paddle/phi/kernels/transpose_kernel.h         |  2 +-
 2 files changed, 50 insertions(+), 37 deletions(-)
diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h
index 336e9c809427c..b5ddb7d0303dc 100644
--- a/paddle/phi/kernels/funcs/values_vectors_functor.h
+++ b/paddle/phi/kernels/funcs/values_vectors_functor.h
@@ -27,10 +27,10 @@
 namespace phi {
 namespace funcs {
 
-inline int64_t GetBatchSize(phi::DDim dims) {
+inline int64_t GetBatchSize(const phi::DDim &dims) {
   int64_t batch_size = 1;
   auto dim_size = dims.size();
-  for (int i = 0; i < dim_size - 2; i++) {
+  for (int i = 0; i < dim_size - 2; ++i) {
     batch_size *= dims[i];
   }
   return batch_size;
@@ -54,6 +54,23 @@ static void CheckEighResult(const int batch, const int info) {
           info));
 }
 
+#ifdef PADDLE_WITH_CUDA
+static void CheckEighResult(const GPUContext &dev_ctx,
+                            const int64_t batch_size,
+                            int *info) {
+  std::vector<int> error_info(batch_size);
+  paddle::memory::Copy(phi::CPUPlace(),
+                       error_info.data(),
+                       dev_ctx.GetPlace(),
+                       info,
+                       sizeof(int) * batch_size,
+                       dev_ctx.stream());
+  for (auto i = 0; i < batch_size; ++i) {
+    CheckEighResult(i, error_info[i]);
+  }
+}
+#endif
+
 template <typename DeviceContext, typename T>
 struct MatrixEighFunctor {
   void operator()(const DeviceContext &dev_ctx,
@@ -95,7 +112,8 @@ struct MatrixEighFunctor<CPUContext, T> {
     char jobz = has_vectors ? 'V' : 'N';
     int n = dims[dim_size - 1];
     int64_t lda = std::max<int64_t>(1, n);
-    // if work = -1, it means that you need to use the lapack function to query
+    // if lwork = -1, it means that you need to use the lapack function to
+    // query
     // the optimal value
     int lwork = -1;      // The length of the array work
     int lrwork = -1;     // The dimension of the array rwork,rwork is REAL array
@@ -188,97 +206,92 @@ struct MatrixEighFunctor<GPUContext, T> {
                   bool is_lower,
                   bool has_vectors) {
     using ValueType = phi::dtype::Real<T>;
-    ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
 
-    DenseTensor input_trans;
-    input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
-    T *input_vector = input_trans.data<T>();
+    int workspace_size = 0;
     auto &dims = input.dims();
     int dim_size = dims.size();
     int64_t batch_size = GetBatchSize(dims);
+    int last_dim = dims[dim_size - 1];
+    int lda = std::max<int>(1, last_dim);
+    auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
+    auto values_stride = dims[dim_size - 1];
 
     cublasFillMode_t uplo =
         is_lower ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
     cusolverEigMode_t jobz =
         has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR;
 
-    int n = dims[dim_size - 1];
-    int lda = std::max<int>(1, n);
-    auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
-    auto values_stride = dims[dim_size - 1];
-    int lwork = 0;
+    ValueType *out_value = dev_ctx.template Alloc<ValueType>(eigen_values);
     auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size);
     auto *info_ptr = reinterpret_cast<int *>(info->ptr());
 
-    // When the input type is float32, and the feature value input dimension
-    // is greater than or equal to [*,32,32]  and less than or equal to
-    // [*,512,512], Syevj has better performance.
+    DenseTensor input_trans = phi::TransposeLast2Dim<T>(dev_ctx, input);
+    T *input_vector = input_trans.data<T>();
+
+    // When the input data type is float32 and the last dimension of the
+    // input lies in the range [32, 512], Syevj performs better.
     bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 &&
                       values_stride >= 32 && values_stride <= 512);
+    auto handle = dev_ctx.cusolver_dn_handle();
+
     syevjInfo_t syevj_params;
     if (use_syevj) {
       PADDLE_ENFORCE_GPU_SUCCESS(
           dynload::cusolverDnCreateSyevjInfo(&syevj_params));
+
       PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
           dev_ctx.cusolver_dn_handle(),
           jobz,
           uplo,
-          n,
+          last_dim,
           reinterpret_cast<const float *>(input_vector),
           lda,
           reinterpret_cast<const float *>(out_value),
-          &lwork,
+          &workspace_size,
           syevj_params));
     } else {
       EvdBuffer(dev_ctx.cusolver_dn_handle(),
                 jobz,
                 uplo,
-                n,
+                last_dim,
                 input_vector,
                 lda,
                 out_value,
-                &lwork);
+                &workspace_size);
     }
-    auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * lwork);
+    auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size);
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
-    for (auto i = 0; i < batch_size; i++) {
+
+    for (auto i = 0; i < batch_size; ++i) {
       auto *input_data = input_vector + i * vector_stride;
       auto *value_data = out_value + i * values_stride;
-      auto handle = dev_ctx.cusolver_dn_handle();
       if (use_syevj) {
         PADDLE_ENFORCE_GPU_SUCCESS(
             dynload::cusolverDnSsyevj(handle,
                                       jobz,
                                       uplo,
-                                      n,
+                                      last_dim,
                                       reinterpret_cast<float *>(input_data),
                                       lda,
                                       reinterpret_cast<float *>(value_data),
                                       reinterpret_cast<float *>(work_ptr),
-                                      lwork,
-                                      info_ptr,
+                                      workspace_size,
+                                      &info_ptr[i],
                                       syevj_params));
       } else {
         Evd(handle,
             jobz,
             uplo,
-            n,
+            last_dim,
             input_data,
             lda,
             value_data,
             work_ptr,
-            lwork,
-            info_ptr);
+            workspace_size,
+            &info_ptr[i]);
       }
-      int error_info = 0;
-      paddle::memory::Copy(phi::CPUPlace(),
-                           &error_info,
-                           dev_ctx.GetPlace(),
-                           info_ptr,
-                           sizeof(int),
-                           dev_ctx.stream());
-      CheckEighResult(i, error_info);
     }
+    CheckEighResult(dev_ctx, batch_size, info_ptr);
 
     if (use_syevj) {
       PADDLE_ENFORCE_GPU_SUCCESS(
diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h
index b8d7fbaa2757d..17432e250eec1 100644
--- a/paddle/phi/kernels/transpose_kernel.h
+++ b/paddle/phi/kernels/transpose_kernel.h
@@ -35,7 +35,7 @@ DenseTensor Transpose(const Context& dev_ctx,
   DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   TransposeInferMeta(x, axis, &meta_out);
-  TransposeKernel<T, Context>(dev_ctx, x, axis, &dense_out);
+  phi::TransposeKernel<T, Context>(dev_ctx, x, axis, &dense_out);
   return dense_out;
 }
 

From d67a7627bdc8a2d5d4b78186385f4a95ac5777af Mon Sep 17 00:00:00 2001
From: JamesLim-sy <santamonic@sina.com>
Date: Thu, 19 May 2022 00:34:21 +0000
Subject: [PATCH 2/3] fix useless change in header file transpose_kernel.h

---
 paddle/phi/kernels/transpose_kernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h
index 17432e250eec1..b8d7fbaa2757d 100644
--- a/paddle/phi/kernels/transpose_kernel.h
+++ b/paddle/phi/kernels/transpose_kernel.h
@@ -35,7 +35,7 @@ DenseTensor Transpose(const Context& dev_ctx,
   DenseTensor dense_out;
   MetaTensor meta_out(&dense_out);
   TransposeInferMeta(x, axis, &meta_out);
-  phi::TransposeKernel<T, Context>(dev_ctx, x, axis, &dense_out);
+  TransposeKernel<T, Context>(dev_ctx, x, axis, &dense_out);
   return dense_out;
 }
 

From b51b637986da4cefb6a49df6d031536af63b093b Mon Sep 17 00:00:00 2001
From: JamesLim-sy <santamonic@sina.com>
Date: Wed, 25 May 2022 03:49:00 +0000
Subject: [PATCH 3/3] add sync

---
 paddle/phi/kernels/funcs/values_vectors_functor.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h
index b5ddb7d0303dc..a6a6d4097030b 100644
--- a/paddle/phi/kernels/funcs/values_vectors_functor.h
+++ b/paddle/phi/kernels/funcs/values_vectors_functor.h
@@ -65,6 +65,7 @@ static void CheckEighResult(const GPUContext &dev_ctx,
                        info,
                        sizeof(int) * batch_size,
                        dev_ctx.stream());
+  dev_ctx.Wait();
   for (auto i = 0; i < batch_size; ++i) {
     CheckEighResult(i, error_info[i]);
   }