PaddlePaddle · sneaxiy · Sep 16, 2022 · Sep 5, 2022 · Sep 7, 2022 · Sep 7, 2022
diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h
@@ -70,11 +70,12 @@ namespace platform {
  *
  */
 
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ =                                              \
-      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
-  for (index_type i = __index__; __index__ < (num);                \
-       __index__ += blockDim.x * gridDim.x, i = __index__)
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                    \
+  int64_t __index__ =                                                \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;   \
+  int64_t __stride__ = static_cast<int64_t>(blockDim.x) * gridDim.x; \
+  for (index_type i = __index__; __index__ < (num);                  \
+       __index__ += __stride__, i = __index__)
 
 class CublasHandleHolder {
  public:

diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h
@@ -70,8 +70,9 @@ namespace platform {
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
   int64_t __index__ =                                                       \
       static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
+  int64_t __stride__ = static_cast<int64_t>(hipBlockDim_x) * hipGridDim_x;  \
   for (index_type i = __index__; __index__ < (num);                         \
-       __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
+       __index__ += __stride__, i = __index__)
 
 class CublasHandleHolder {
  public:

diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h
@@ -62,11 +62,12 @@ namespace gpu {
  *
  */
 
-#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                  \
-  int64_t __index__ =                                              \
-      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x; \
-  for (index_type i = __index__; __index__ < (num);                \
-       __index__ += blockDim.x * gridDim.x, i = __index__)
+#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                    \
+  int64_t __index__ =                                                \
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;   \
+  int64_t __stride__ = static_cast<int64_t>(blockDim.x) * gridDim.x; \
+  for (index_type i = __index__; __index__ < (num);                  \
+       __index__ += __stride__, i = __index__)
 
 }  // namespace gpu
 }  // namespace backends

diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h
@@ -65,8 +65,9 @@ namespace gpu {
 #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type)                           \
   int64_t __index__ =                                                       \
       static_cast<int64_t>(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \
+  int64_t __stride__ = static_cast<int64_t>(hipBlockDim_x) * hipGridDim_x;  \
   for (index_type i = __index__; __index__ < (num);                         \
-       __index__ += hipBlockDim_x * hipGridDim_x, i = __index__)
+       __index__ += __stride__, i = __index__)
 
 }  // namespace gpu
 }  // namespace backends