NVIDIA · gevtushenko · Jul 14, 2023 · Jul 14, 2023
@@ -444,7 +444,7 @@ public:
         ReductionOp         reduction_op,           ///< [in] Binary reduction functor 
         int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
     {
-        // Determine if we scan skip bounds checking
+        // Determine whether we can skip bounds checking
         if (num_valid >= BLOCK_THREADS)
         {
             return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
@@ -585,7 +585,7 @@ public:
         T   input,                  ///< [in] Calling thread's input
         int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
     {
-        // Determine if we scan skip bounds checking
+        // Determine whether we can skip bounds checking
         if (num_valid >= BLOCK_THREADS)
         {
             return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);

@@ -125,7 +125,7 @@ struct BlockReduceRaking
 
     template <bool IS_FULL_TILE, typename ReductionOp, int ITERATION>
     __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
+        ReductionOp                 reduction_op,       ///< [in] Binary reduction operator
         T                           *raking_segment,
         T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
         int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
@@ -142,7 +142,7 @@ struct BlockReduceRaking
 
     template <bool IS_FULL_TILE, typename ReductionOp>
     __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 /*reduction_op*/,   ///< [in] Binary scan operator
+        ReductionOp                 /*reduction_op*/,   ///< [in] Binary reduction operator
         T                           * /*raking_segment*/,
         T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
         int                         /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)

@@ -97,7 +97,7 @@ struct BlockReduceRakingCommutativeOnly
             typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded thread block raking grid
         } default_storage;
 
-        typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block scan
+        typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block reduction
     };
 
 
@@ -144,7 +144,7 @@ struct BlockReduceRakingCommutativeOnly
                 T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.default_storage.raking_grid, linear_tid);
                 partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
 
-                // Warpscan
+                // Warp reduction
                 partial = WarpReduce(temp_storage.default_storage.warp_storage).Sum(partial);
             }
         }
@@ -181,7 +181,7 @@ struct BlockReduceRakingCommutativeOnly
                 T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.default_storage.raking_grid, linear_tid);
                 partial = internal::ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
 
-                // Warpscan
+                // Warp reduction
                 partial = WarpReduce(temp_storage.default_storage.warp_storage).Reduce(partial, reduction_op);
             }
         }

@@ -79,8 +79,8 @@ struct BlockReduceWarpReductions
     /// Shared memory storage layout type
     struct _TempStorage
     {
-        typename WarpReduce::TempStorage    warp_reduce[WARPS];         ///< Buffer for warp-synchronous scan
-        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous scan
+        typename WarpReduce::TempStorage    warp_reduce[WARPS];         ///< Buffer for warp-synchronous reduction
+        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous reduction
         T                                   block_prefix;               ///< Shared prefix for the entire thread block
     };
 
@@ -108,7 +108,7 @@ struct BlockReduceWarpReductions
 
     template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
     __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
+        ReductionOp                 reduction_op,       ///< [in] Binary reduction operator
         T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
         int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
         Int2Type<SUCCESSOR_WARP>    /*successor_warp*/)
@@ -123,7 +123,7 @@ struct BlockReduceWarpReductions
 
     template <bool FULL_TILE, typename ReductionOp>
     __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         /*reduction_op*/,   ///< [in] Binary scan operator
+        ReductionOp         /*reduction_op*/,   ///< [in] Binary reduction operator
         T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
         int                 /*num_valid*/,      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
         Int2Type<WARPS>     /*successor_warp*/)
@@ -137,7 +137,7 @@ struct BlockReduceWarpReductions
         bool                FULL_TILE,
         typename            ReductionOp>
     __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         reduction_op,       ///< [in] Binary scan operator
+        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
         T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
         int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
     {

@@ -70,9 +70,9 @@ __device__ __forceinline__ AccumT ThreadReduce(
 /**
  * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
  *
- * \tparam LENGTH     LengthT of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ * \tparam LENGTH       Length of input array
+ * \tparam T            <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp  <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
  */
 template <
     int         LENGTH,
@@ -92,9 +92,9 @@ __device__ __forceinline__ AccumT ThreadReduce(
 /**
  * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array.  The aggregate is returned.
  *
- * \tparam LENGTH     LengthT of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ * \tparam LENGTH       Length of input array
+ * \tparam T            <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp  <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
  */
 template <
     int         LENGTH,
@@ -112,9 +112,9 @@ __device__ __forceinline__ T ThreadReduce(
 /**
  * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
  *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ * \tparam LENGTH       <b>[inferred]</b> Length of \p input array
+ * \tparam T            <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp  <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
  */
 template <
     int         LENGTH,
@@ -134,9 +134,9 @@ __device__ __forceinline__ AccumT ThreadReduce(
 /**
  * \brief Serial reduction with the specified operator
  *
- * \tparam LENGTH     <b>[inferred]</b> LengthT of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
+ * \tparam LENGTH       <b>[inferred]</b> Length of \p input array
+ * \tparam T            <b>[inferred]</b> The data type to be reduced.
+ * \tparam ReductionOp  <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
  */
 template <
     int         LENGTH,

@@ -390,7 +390,7 @@ struct WarpReduceShfl
 
 
     //---------------------------------------------------------------------
-    // Templated inclusive scan iteration
+    // Templated reduction iteration
     //---------------------------------------------------------------------
 
     template <typename ReductionOp, int STEP>

@@ -62,7 +62,7 @@ struct WarpReduceSmem
         /// Whether the logical warp size is a power-of-two
         IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
 
-        /// The number of warp scan steps
+        /// The number of warp reduction steps
         STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
 
         /// The number of threads in half a warp