From a24f2aa8d0681e8a940aa686bb5f80f70fb71671 Mon Sep 17 00:00:00 2001
From: zkh2016
Date: Mon, 20 Jun 2022 09:20:32 +0000
Subject: [PATCH] opt gather and scatter

---
 paddle/phi/kernels/funcs/sparse/scatter.cu.h  | 31 +++++---
 .../kernels/sparse/gpu/coalesced_kernel.cu    | 36 ++++++---
 .../phi/kernels/sparse/gpu/convolution.cu.h   |  1 -
 .../sparse/gpu/convolution_grad_kernel.cu     | 73 +++++++++++------
 .../kernels/sparse/gpu/convolution_kernel.cu  | 78 +++++++++++++------
 5 files changed, 150 insertions(+), 69 deletions(-)

diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h
index b9568f1df716d..48b12f8a1b6de 100644
--- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h
+++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
+
+#define VecBytes 16
 
 namespace phi {
 namespace funcs {
@@ -28,33 +31,37 @@ namespace sparse {
  * channels: the output channel size
  * out: the outputs
 **/
-template <typename T>
+template <typename T, int VecSize>
 __global__ void ScatterKernel(const T* input,
                               const int* unique_value,
                               const int* out_index,
                               const int non_zero_num,
                               const int rulebook_len,
                               const int channels,
-                              T* out,
-                              const bool subm = false) {
+                              T* out) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) {
-    int indices_i = i / channels;
-    int channels_i = i - indices_i * channels;
+  const int vec_channels = channels / VecSize;
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  using StoreT = phi::AlignedVector<T, VecSize>;
+  for (int i = tid; i < non_zero_num * vec_channels;
+       i += gridDim.x * blockDim.x) {
+    int indices_i = i / vec_channels;
+    int channels_i = i - indices_i * vec_channels;
 
     int start = unique_value[indices_i];
     int end = indices_i == non_zero_num - 1 ? rulebook_len
                                             : unique_value[indices_i + 1];
     // max(end-start) = kernel_size
-    T sum = static_cast<T>(0);
-    if (subm) {
-      sum = out[indices_i * channels + channels_i];
-    }
+    StoreT sums = {static_cast<T>(0)};
     for (int j = start; j < end; j++) {
       const int out_feature_i = out_index[j];
-      sum += input[out_feature_i * channels + channels_i];
+      LoadT vec_in;
+      phi::Load<T, VecSize>(
+          input + out_feature_i * channels + channels_i * VecSize, &vec_in);
+#pragma unroll
+      for (int k = 0; k < VecSize; k++) {
+        sums[k] += vec_in[k];
+      }
     }
-    out[indices_i * channels + channels_i] = sum;
+    phi::Store<T, VecSize>(sums,
+                           out + indices_i * channels + channels_i * VecSize);
  }
 }
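The rewrite of ScatterKernel above is the core of the patch: the kernel is a segmented sum (unique_value[i] holds the start of output row i's segment, and out_index maps each segment entry back to an input row), and phi::AlignedVector with phi::Load/phi::Store turns each thread's per-element traffic into one VecBytes (16-byte) transaction. The old subm special case is dropped; the submanifold path is served by phi::funcs::ScatterCUDAKernel in convolution_kernel.cu below. A minimal standalone sketch of the same pattern, with CUDA's built-in float4 standing in for phi::AlignedVector<float, 4> (all names here are illustrative, not Paddle APIs):

    #include <cuda_runtime.h>

    // Segmented, vectorized scatter-sum: one thread owns one float4 lane of
    // one output row and accumulates the [start, end) segment of input rows
    // that coalesce into it.
    __global__ void SegmentSumVec4(const float* in,       // [in_rows, channels]
                                   const int* seg_start,  // [out_rows] segment offsets
                                   const int* in_row,     // [in_rows] source row ids
                                   int out_rows,
                                   int in_rows,
                                   int channels,          // must be divisible by 4
                                   float* out) {          // [out_rows, channels]
      int vec_channels = channels / 4;
      for (int i = blockIdx.x * blockDim.x + threadIdx.x;
           i < out_rows * vec_channels; i += gridDim.x * blockDim.x) {
        int row = i / vec_channels;
        int lane = i - row * vec_channels;
        int start = seg_start[row];
        int end = row == out_rows - 1 ? in_rows : seg_start[row + 1];
        float4 sum = make_float4(0.f, 0.f, 0.f, 0.f);
        for (int j = start; j < end; ++j) {
          // one 16-byte load instead of four 4-byte loads
          float4 v =
              reinterpret_cast<const float4*>(in + in_row[j] * channels)[lane];
          sum.x += v.x; sum.y += v.y; sum.z += v.z; sum.w += v.w;
        }
        reinterpret_cast<float4*>(out + row * channels)[lane] = sum;
      }
    }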
diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu
index 7d9e566916add..44ecb4365a9c1 100644
--- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu
@@ -132,16 +132,32 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx,
   }
 
   // 5. scatter the values
-  config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1);
-  phi::funcs::sparse::ScatterKernel<T>
-      <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
-          x_values_ptr,
-          public_indexs.data<int>(),
-          values_indexs_ptr,
-          out_nnz,
-          nnz,
-          stride,
-          out_values.data<T>());
+  const int VecSize = VecBytes / sizeof(T);
+  if (stride % VecSize == 0) {
+    config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx,
+                                                      nnz * stride / VecSize, 1);
+    phi::funcs::sparse::ScatterKernel<T, VecSize>
+        <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
+            x_values_ptr,
+            public_indexs.data<int>(),
+            values_indexs_ptr,
+            out_nnz,
+            nnz,
+            stride,
+            out_values.data<T>());
+  } else {
+    config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx,
+                                                      nnz * stride, 1);
+    phi::funcs::sparse::ScatterKernel<T, 1>
+        <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
+            x_values_ptr,
+            public_indexs.data<int>(),
+            values_indexs_ptr,
+            out_nnz,
+            nnz,
+            stride,
+            out_values.data<T>());
+  }
 
   // 6. convert index to coordinate
   Dim<DDim::kMaxRank> const_dims;
diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
index 74d3108806a2a..7f0d4814b75d0 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h
+++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h
@@ -59,7 +59,6 @@ __global__ void GatherKernel(const T* params,
     int slice_i = i - indices_i * vec_slice_size;  // offset inside the slice
     IndexT gather_i = indices[indices_i];
     int64_t params_i = gather_i * slice_size + slice_i * VecSize;
-    //*(output + i) = *(params + params_i);
     using LoadT = phi::AlignedVector<T, VecSize>;
     using StoreT = phi::AlignedVector<T, VecSize>;
     LoadT params_vec;
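Every call site this patch touches follows the same dispatch shape: a runtime divisibility check chooses between two compile-time instantiations of one kernel. The check is necessary because a VecSize > 1 instantiation would issue misaligned and out-of-bounds vector accesses whenever the channel count is not a multiple of VecSize; the VecSize = 1 fallback keeps arbitrary channel counts correct. A hedged host-side sketch of that shape (CopyVec and LaunchCopy are illustrative stand-ins, not Paddle APIs):

    #include <cuda_runtime.h>

    // Stand-in for the patch's gather/scatter kernels: each thread moves
    // VecSize consecutive elements.
    template <typename T, int VecSize>
    __global__ void CopyVec(const T* in, T* out, int n_vec) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n_vec) {
    #pragma unroll
        for (int k = 0; k < VecSize; ++k) {
          out[i * VecSize + k] = in[i * VecSize + k];
        }
      }
    }

    template <typename T>
    void LaunchCopy(const T* in, T* out, int rows, int channels,
                    cudaStream_t stream) {
      constexpr int kVecSize = 16 / static_cast<int>(sizeof(T));  // VecBytes = 16
      const int threads = 256;
      const int n = rows * channels;
      if (channels % kVecSize == 0) {
        // vectorized path: kVecSize-times fewer threads, wide accesses
        const int n_vec = n / kVecSize;
        CopyVec<T, kVecSize><<<(n_vec + threads - 1) / threads, threads, 0,
                               stream>>>(in, out, n_vec);
      } else {
        // scalar fallback for channel counts that do not divide evenly
        CopyVec<T, 1><<<(n + threads - 1) / threads, threads, 0, stream>>>(
            in, out, n);
      }
    }

Note that the grid size also shrinks by VecSize on the vectorized path, which is why the GetGpuLaunchConfig1D calls in this patch divide the element count by VecSize rather than by sizeof(T) as the removed lines did.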
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
index 0b89d4a2dc269..ebf2bf6cae896 100644
--- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/scatter.cu.h"
+#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
 #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
 #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
 
@@ -137,28 +138,56 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx,
     }
   }
 
-  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
-      dev_ctx, rulebook_len * in_channels/sizeof(T), 1);
-  GatherKernel<T, IntT, VecSize>
-      <<<config.block_per_grid.x,
-         config.thread_per_block.x,
-         0,
-         dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
-                             rulebook_ptr + rulebook_len,
-                             in_features_ptr,
-                             rulebook_len,
-                             in_channels);
+  const int VecSize = VecBytes / sizeof(T);
+  if (in_channels % VecSize == 0) {
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, rulebook_len * in_channels / VecSize, 1);
+    GatherKernel<T, IntT, VecSize>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
+                               rulebook_ptr + rulebook_len,
+                               in_features_ptr,
+                               rulebook_len,
+                               in_channels);
+  } else {
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, rulebook_len * in_channels, 1);
+    GatherKernel<T, IntT, 1>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
+                               rulebook_ptr + rulebook_len,
+                               in_features_ptr,
+                               rulebook_len,
+                               in_channels);
+  }
 
-  config = phi::backends::gpu::GetGpuLaunchConfig1D(
-      dev_ctx, rulebook_len * out_channels/sizeof(T), 1);
-  GatherKernel<T, IntT, VecSize>
-      <<<config.block_per_grid.x,
-         config.thread_per_block.x,
-         0,
-         dev_ctx.stream()>>>(out_grad.non_zero_elements().data<T>(),
-                             rulebook_ptr + rulebook_len * 2,
-                             out_grad_features_ptr,
-                             rulebook_len,
-                             out_channels);
+  if (out_channels % VecSize == 0) {
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, rulebook_len * out_channels / VecSize, 1);
+    GatherKernel<T, IntT, VecSize>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(out_grad.non_zero_elements().data<T>(),
+                               rulebook_ptr + rulebook_len * 2,
+                               out_grad_features_ptr,
+                               rulebook_len,
+                               out_channels);
+  } else {
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, rulebook_len * out_channels, 1);
+    GatherKernel<T, IntT, 1>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(out_grad.non_zero_elements().data<T>(),
+                               rulebook_ptr + rulebook_len * 2,
+                               out_grad_features_ptr,
+                               rulebook_len,
+                               out_channels);
+  }
 
   const T* kernel_ptr = kernel.data<T>();
   for (int i = 0; i < kernel_size; i++) {
@@ -203,7 +232,7 @@
   }
 
   // 4. scatter
-  config = phi::backends::gpu::GetGpuLaunchConfig1D(
+  auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
       dev_ctx, rulebook_len * in_channels, 1);
   phi::funcs::ScatterCUDAKernel<T, IntT>
       <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
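The gather side mirrors the scatter: GatherKernel (context in the convolution.cu.h hunk above) reads the rulebook-selected input row and writes it densely, one aligned VecSize slice per thread. A standalone float/VecSize = 4 sketch of that access pattern, again with float4 in place of phi::AlignedVector and illustrative names:

    #include <cuda_runtime.h>

    // Vectorized gather: thread i copies the i-th 16-byte slice of the
    // densified output, reading from the row selected by indices[].
    __global__ void GatherRowsVec4(const float* params,  // [src_rows, channels]
                                   const int* indices,   // [dst_rows]
                                   float* out,           // [dst_rows, channels]
                                   int dst_rows,
                                   int channels) {       // divisible by 4
      int vec_channels = channels / 4;
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i >= dst_rows * vec_channels) return;
      int row = i / vec_channels;
      int lane = i - row * vec_channels;
      const float4* src =
          reinterpret_cast<const float4*>(params + indices[row] * channels);
      reinterpret_cast<float4*>(out)[i] = src[lane];
    }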
diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
--- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
+++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
@@ ... @@ void Conv3dGPUKernel(const GPUContext& dev_ctx,
   phi::funcs::SetConstant<GPUContext, T> set_zero;
   set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
 
-  auto config =
-      phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels/sizeof(T), 1);
-  GatherKernel<T, IntT, VecSize>
-      <<<config.block_per_grid.x,
-         config.thread_per_block.x,
-         0,
-         dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
-                             rulebook_ptr + n,
-                             in_features_ptr,
-                             n,
-                             in_channels);
+  const int VecSize = VecBytes / sizeof(T);
+  if (in_channels % VecSize == 0) {
+    auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+        dev_ctx, n * in_channels / VecSize, 1);
+    GatherKernel<T, IntT, VecSize>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
+                               rulebook_ptr + n,
+                               in_features_ptr,
+                               n,
+                               in_channels);
+  } else {
+    auto config =
+        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
+    GatherKernel<T, IntT, 1>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
+                               rulebook_ptr + n,
+                               in_features_ptr,
+                               n,
+                               in_channels);
+  }
 
   // 3. call gemm for every werght
   auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
@@ -155,7 +169,7 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx,
   // 4. scatter
   if (subm) {
     set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
-    config =
+    auto config =
         phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * out_channels, 1);
     phi::funcs::ScatterCUDAKernel<T, IntT>
         <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
@@ ... @@ void Conv3dGPUKernel(const GPUContext& dev_ctx,
   } else {
-    config = phi::backends::gpu::GetGpuLaunchConfig1D(
-        dev_ctx, out->nnz() * out_channels, 1);
-    phi::funcs::sparse::ScatterKernel<T>
-        <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
-            out_features_ptr,
-            unique_value.data<int>(),
-            out_index.data<int>(),
-            out->nnz(),
-            n,
-            out_channels,
-            out_values_ptr);
+    if (out_channels % VecSize == 0) {
+      auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+          dev_ctx, out->nnz() * out_channels / VecSize, 1);
+      phi::funcs::sparse::ScatterKernel<T, VecSize>
+          <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
+              out_features_ptr,
+              unique_value.data<int>(),
+              out_index.data<int>(),
+              out->nnz(),
+              n,
+              out_channels,
+              out_values_ptr);
+    } else {
+      auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
+          dev_ctx, out->nnz() * out_channels, 1);
+      phi::funcs::sparse::ScatterKernel<T, 1>
+          <<<config.block_per_grid, config.thread_per_block, 0, dev_ctx.stream()>>>(
+              out_features_ptr,
+              unique_value.data<int>(),
+              out_index.data<int>(),
+              out->nnz(),
+              n,
+              out_channels,
+              out_values_ptr);
+    }
   }
 }
 
 /**
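For completeness, minimal host-side usage of the GatherRowsVec4 sketch above, to make the launch-size arithmetic concrete (illustrative only, not Paddle code). The reinterpret_cast in these sketches is safe here because cudaMalloc returns pointers aligned to at least 256 bytes and channels is a multiple of 4, so every row slice stays 16-byte aligned:

    #include <cstdio>
    #include <vector>
    #include <cuda_runtime.h>

    int main() {
      const int src_rows = 8, dst_rows = 4, channels = 32;  // 32 % 4 == 0
      std::vector<float> h_params(src_rows * channels);
      for (size_t i = 0; i < h_params.size(); ++i) {
        h_params[i] = static_cast<float>(i);
      }
      std::vector<int> h_idx = {3, 0, 7, 2};  // rulebook-like row selection
      float *d_params, *d_out;
      int* d_idx;
      cudaMalloc(&d_params, h_params.size() * sizeof(float));
      cudaMalloc(&d_out, dst_rows * channels * sizeof(float));
      cudaMalloc(&d_idx, h_idx.size() * sizeof(int));
      cudaMemcpy(d_params, h_params.data(), h_params.size() * sizeof(float),
                 cudaMemcpyHostToDevice);
      cudaMemcpy(d_idx, h_idx.data(), h_idx.size() * sizeof(int),
                 cudaMemcpyHostToDevice);
      // one thread per 16-byte slice of the gathered output
      const int n_vec = dst_rows * (channels / 4), threads = 128;
      GatherRowsVec4<<<(n_vec + threads - 1) / threads, threads>>>(
          d_params, d_idx, d_out, dst_rows, channels);
      cudaDeviceSynchronize();
      std::vector<float> h_out(dst_rows * channels);
      cudaMemcpy(h_out.data(), d_out, h_out.size() * sizeof(float),
                 cudaMemcpyDeviceToHost);
      std::printf("out[0][0] = %.0f (expect %d)\n", h_out[0], 3 * channels);
      cudaFree(d_params); cudaFree(d_out); cudaFree(d_idx);
      return 0;
    }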