Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DCU] fix compile error, test=develop #61872

Merged
merged 2 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions cmake/generic.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -763,12 +763,6 @@ function(hip_library TARGET_NAME)
cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
if(hip_library_SRCS)
# FindHIP.cmake defines hip_add_library; HIP_SOURCE_PROPERTY_FORMAT is required if no .cu files found
if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators"
OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels"))
set_source_files_properties(${hip_library_SRCS}
PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
endif()
if(hip_library_SHARED OR hip_library_shared) # build *.so
hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS})
else()
Expand All @@ -782,6 +776,10 @@ function(hip_library TARGET_NAME)
endif()
# cpplint code style
foreach(source_file ${hip_library_SRCS})
if(NOT ${source_file} MATCHES "\\.cu$")
set_source_files_properties(${source_file}
PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
endif()
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND hip_library_HEADERS
Expand Down
7 changes: 5 additions & 2 deletions paddle/phi/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,11 @@ if(WITH_GPU)
DEPS ${PHI_DEPS})

elseif(WITH_ROCM)
hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS})
target_link_libraries(phi ${PHI_DEPS})
hip_library(
phi ${PHI_BUILD_TYPE}
SRCS ${PHI_SRCS}
DEPS ${PHI_DEPS})

elseif(WITH_XPU_KP)
xpu_library(
phi ${PHI_BUILD_TYPE}
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/core/visit_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,7 @@ namespace phi {
"`"); \
} \
}()
#if defined(PADDLE_WITH_XPU)
#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP)
#define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
Expand Down
26 changes: 26 additions & 0 deletions paddle/phi/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,32 @@ if(NOT WITH_CUDNN_FRONTEND)
"fusion/gpu/max_pool2d_v2_kernel.cu")
endif()

# Note(qili93): remove kernels not supported on DCU yet
if(WITH_ROCM)
list(
REMOVE_ITEM
kernel_cu
"gpu/affine_grid_grad_kernel.cu"
"gpu/apply_per_channel_scale_kernel.cu"
"gpu/cholesky_solve_kernel.cu"
"gpu/eigh_kernel.cu"
"gpu/eigvalsh_kernel.cu"
"gpu/lstsq_kernel.cu"
"gpu/lu_kernel.cu"
"gpu/matrix_rank_kernel.cu"
"gpu/matrix_rank_tol_kernel.cu"
"gpu/multiclass_nms3_kernel.cu"
"gpu/put_along_axis_grad_kernel.cu"
"gpu/put_along_axis_kernel.cu"
"gpu/qr_kernel.cu"
"gpu/svd_kernel.cu"
"gpudnn/mha_cudnn_frontend.cu"
"fusion/gpu/block_multi_head_attention_kernel.cu"
"fusion/gpu/fused_bn_add_activation_grad_kernel.cu"
"fusion/gpu/fused_bn_add_activation_kernel.cu"
"fusion/gpu/fusion_transpose_flatten_concat_kernel.cu")
endif()

set(cc_search_pattern
"*.cc"
"cpu/*.cc"
Expand Down
5 changes: 5 additions & 0 deletions paddle/phi/kernels/funcs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,9 @@ if(WITH_GPU OR WITH_ROCM)
"*.cu")
endif()

# Note(qili93): remove kernels not supported on DCU yet
if(WITH_ROCM)
list(REMOVE_ITEM func_cu_srcs "weight_only_gemv.cu")
endif()

collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs})
9 changes: 9 additions & 0 deletions paddle/phi/kernels/gpu/binomial_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,13 @@ __device__ int64_t btrs(
const T m = std::floor((n + 1) * p);

while (1) {
#ifdef __NVCC__
U = static_cast<T>(curand_uniform(&state)) - 0.5;
V = static_cast<T>(curand_uniform(&state));
#elif __HIPCC__
U = static_cast<T>(hiprand_uniform(&state)) - 0.5;
V = static_cast<T>(hiprand_uniform(&state));
#endif

us = 0.5 - std::abs(U);
k = static_cast<int64_t>(std::floor((2 * a / us + b) * U + c));
Expand Down Expand Up @@ -118,7 +123,11 @@ __device__ int64_t binomial_inversion(
#endif

while (1) {
#ifdef __NVCC__
unif = static_cast<T>(curand_uniform(&state));
#elif __HIPCC__
unif = static_cast<T>(hiprand_uniform(&state));
#endif
T geom = std::ceil(std::log(unif) / logprob);
geom_sum += geom;
if (geom_sum > n) {
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ __device__ __forceinline__ void BlockReduce(Pair<T> shared_max[],
if (*beam >= MaxLength) break;
} else {
#ifdef PADDLE_WITH_HIP
uint64 mask = 0;
unsigned mask = 0u;
mask = __ballot(true);
if (tid_max / WARP_SIZE == wid) {
if (__shfl_down(*beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength)
Expand Down
10 changes: 10 additions & 0 deletions paddle/phi/kernels/gpu/triangular_solve_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/triangular_solve_grad_kernel_impl.h"

#ifdef PADDLE_WITH_CUDA
PD_REGISTER_KERNEL(triangular_solve_grad,
GPU,
ALL_LAYOUT,
Expand All @@ -23,3 +24,12 @@ PD_REGISTER_KERNEL(triangular_solve_grad,
double,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
#else // PADDLE_WITH_HIP
// blas_impl.hip.h not support CUBlas<T>::TRSM for complex
PD_REGISTER_KERNEL(triangular_solve_grad,
GPU,
ALL_LAYOUT,
phi::TriangularSolveGradKernel,
float,
double) {}
#endif
10 changes: 10 additions & 0 deletions paddle/phi/kernels/gpu/triangular_solve_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ void TriangularSolveKernel(const Context& dev_ctx,

} // namespace phi

#ifdef PADDLE_WITH_CUDA
PD_REGISTER_KERNEL(triangular_solve,
GPU,
ALL_LAYOUT,
Expand All @@ -131,3 +132,12 @@ PD_REGISTER_KERNEL(triangular_solve,
double,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
#else // PADDLE_WITH_HIP
// blas_impl.hip.h not support CUBlas<T>::TRSM for complex
PD_REGISTER_KERNEL(triangular_solve,
GPU,
ALL_LAYOUT,
phi::TriangularSolveKernel,
float,
double) {}
#endif
5 changes: 5 additions & 0 deletions paddle/phi/kernels/gpu/unique_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@
#include <iostream>
#include <vector>

#ifdef PADDLE_WITH_CUDA
#include "cub/cub.cuh"
#else
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/kernel_registry.h"
Expand Down
12 changes: 10 additions & 2 deletions paddle/phi/kernels/gpudnn/pool_gpudnn.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,21 @@ class CudnnIndexType;
template <>
class CudnnIndexType<int> {
public:
static const cudnnDataType_t type = CUDNN_DATA_INT32;
#ifdef PADDLE_WITH_CUDA
static const dnnDataType_t type = CUDNN_DATA_INT32;
#else
static const dnnDataType_t type = miopenInt32;
#endif
};

template <>
class CudnnIndexType<int8_t> {
public:
static const cudnnDataType_t type = CUDNN_DATA_INT8;
#ifdef PADDLE_WITH_CUDA
static const dnnDataType_t type = CUDNN_DATA_INT8;
#else
static const dnnDataType_t type = miopenInt8;
#endif
};

inline GPUDNNDataLayout GetLayoutFromStr(std::string data_format) {
Expand Down